diff --git a/.bazelversion b/.bazelversion index f22d756da39..a8907c025d5 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -6.5.0 +7.0.2 diff --git a/.ci/env/openblas.sh b/.ci/env/openblas.sh index f154c5463df..2a2e8ddf448 100755 --- a/.ci/env/openblas.sh +++ b/.ci/env/openblas.sh @@ -16,7 +16,7 @@ #=============================================================================== sudo apt-get update -sudo apt-get install build-essential gcc gfortran +sudo apt-get -y install build-essential gcc gfortran git clone https://github.com/xianyi/OpenBLAS.git CoreCount=$(lscpu -p | grep -Ev '^#' | wc -l) pushd OpenBLAS diff --git a/.ci/env/tbb.sh b/.ci/env/tbb.sh new file mode 100755 index 00000000000..3b6a991647c --- /dev/null +++ b/.ci/env/tbb.sh @@ -0,0 +1,83 @@ +#!/bin/bash +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +# Function to display help +show_help() { + echo "Usage: $0 [-h]" + echo " -h Display this information" + echo " Set CC and CXX environment variables to change the compiler. Default is GNU." +} + +# Check for command-line options +while getopts ":h" opt; do + case $opt in + h) + show_help + exit 0 + ;; + \?) 
+ echo "Invalid option: -$OPTARG" >&2 + show_help + exit 1 + ;; + esac +done + +# Set default values for CXX and CC +CXX="${CXX:-g++}" +CC="${CC:-gcc}" + +echo "CXX is set to: $CXX" +echo "CC is set to: $CC" + +TBB_VERSION="v2021.10.0" + +arch=$(uname -m) +if [ "${arch}" == "x86_64" ]; then + arch_dir="intel64" +elif [ "${arch}" == "aarch64" ]; then + arch_dir="arm" +else + arch_dir=${arch} +fi + +sudo apt-get update +sudo apt-get install build-essential gcc gfortran cmake -y +git clone --depth 1 --branch ${TBB_VERSION} https://github.com/oneapi-src/oneTBB.git onetbb-src + +CoreCount=$(lscpu -p | grep -Ev '^#' | wc -l) + +rm -rf __deps/tbb +pushd onetbb-src +mkdir build +pushd build +cmake -DCMAKE_CXX_COMPILER=${CXX} -DCMAKE_BUILD_TYPE=Release -DTBB_TEST=OFF -DTBB_STRICT_PROTOTYPES=OFF -DCMAKE_INSTALL_PREFIX=../../__deps/tbb .. +make -j${CoreCount} +make install +popd +popd +rm -rf onetbb-src + +pushd __deps/tbb + mkdir -p lnx + mv lib/ lnx/ + mv include/ lnx/ + pushd lnx + mkdir -p lib/${arch_dir}/gcc4.8 + mv lib/libtbb* lib/${arch_dir}/gcc4.8 + popd +popd diff --git a/.ci/scripts/build.bat b/.ci/scripts/build.bat index f002b5e755e..d218464707c 100644 --- a/.ci/scripts/build.bat +++ b/.ci/scripts/build.bat @@ -40,4 +40,4 @@ call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Buil echo make %1 -j%CPUCOUNT% COMPILER=%2 PLAT=win32e REQCPU=%3 make %1 -j%CPUCOUNT% COMPILER=%2 PLAT=win32e REQCPU=%3 -cmake -DINSTALL_DIR=__release_win_vc\daal\latest\lib\cmake\oneDAL -P cmake\scripts\generate_config.cmake +cmake -DINSTALL_DIR=__release_win_vc\daal\latest\lib\cmake\oneDAL -DARCH_DIR=intel64 -P cmake\scripts\generate_config.cmake diff --git a/.ci/scripts/build.sh b/.ci/scripts/build.sh index 8d47bbe9655..62b3623a3fe 100755 --- a/.ci/scripts/build.sh +++ b/.ci/scripts/build.sh @@ -1,6 +1,7 @@ #! 
/bin/bash #=============================================================================== # Copyright 2019 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -47,7 +48,17 @@ PLATFORM=$(bash dev/make/identify_os.sh) OS=${PLATFORM::3} ARCH=${PLATFORM:3:3} +if [[ "${ARCH}" == "32e" ]] +then optimizations=${optimizations:-avx2} +elif [[ "${ARCH}" == "arm" ]] +then +optimizations=${optimizations:-sve} +else +echo "Unknown architecture '${ARCH}'" +exit 1 +fi + backend_config=${backend_config:-mkl} GLOBAL_RETURN=0 @@ -97,7 +108,16 @@ elif [ "${backend_config}" == "ref" ]; then else echo "Not supported backend env" fi + +#TBB setup +if [[ "${ARCH}" == "32e" ]] +then $(pwd)/dev/download_tbb.sh +elif [[ "${ARCH}" == "arm" ]] +then +$(pwd)/.ci/env/tbb.sh +fi + echo "Calling make" make ${target:-daal_c} ${make_op} \ COMPILER=${compiler} \ diff --git a/.ci/scripts/test.sh b/.ci/scripts/test.sh index d014eb9ede4..b9856cf8554 100755 --- a/.ci/scripts/test.sh +++ b/.ci/scripts/test.sh @@ -1,6 +1,7 @@ #! /bin/bash #=============================================================================== # Copyright 2019 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -54,7 +55,17 @@ TESTING_RETURN=0 PLATFORM=$(bash dev/make/identify_os.sh) OS=${PLATFORM::3} ARCH=${PLATFORM:3:3} -full_arch=intel64 +if [ "$ARCH" == "32e" ]; then + full_arch=intel64 + arch_dir=intel_intel64 +elif [ "$ARCH" == "arm" ]; then + full_arch=arm + arch_dir=arm_aarch64 +else + echo "Unknown architecture ${ARCH} detected for platform ${PLATFORM}" + exit 1 +fi + build_system=${build_system:-cmake} backend=${backend:-mkl} @@ -161,7 +172,7 @@ for link_mode in ${link_modes}; do fi output_result= err= - cmake_results_dir="_cmake_results/intel_intel64_${lib_ext}" + cmake_results_dir="_cmake_results/${arch_dir}_${lib_ext}" for p in ${cmake_results_dir}/*; do e=$(basename "$p") ${p} 2>&1 > ${e}.res diff --git a/.github/renovate.json b/.github/renovate.json index 2e9a9582bb6..810cef4990a 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -1,6 +1,6 @@ { "extends": [ - "config:base", + "config:recommended", ":preserveSemverRanges" ], "pip_requirements": { diff --git a/.github/workflows/renovate-validation.yml b/.github/workflows/renovate-validation.yml index a82ae67b1cd..3241cb94564 100644 --- a/.github/workflows/renovate-validation.yml +++ b/.github/workflows/renovate-validation.yml @@ -25,6 +25,6 @@ jobs: - name: Checkout uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 - name: Validate - uses: suzuki-shunsuke/github-action-renovate-config-validator@v0.1.3 + uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.0.1 with: config_file_path: .github/renovate.json diff --git a/.gitignore b/.gitignore index f61844615bf..b74a2dbc3d2 100644 --- a/.gitignore +++ b/.gitignore @@ -15,12 +15,12 @@ bazel-* # Visual Studio related files, e.g., ".vscode" .vs* -# Bazel directories -bazel-* - # PyCharm directories .idea* # CMake directories and cache CMakeFiles CMakeCache.txt + +# MODULE.bazel lock file +MODULE.bazel.lock diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 00000000000..01ab1547fa8 --- /dev/null +++ 
b/MODULE.bazel @@ -0,0 +1,3 @@ +module(name = "onedal") + +bazel_dep(name = "bazel_skylib", version = "1.5.0") diff --git a/WORKSPACE b/WORKSPACE index 6373b40e270..888ba44e002 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,14 +1,6 @@ workspace(name = "onedal") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -http_archive( - name = "bazel_skylib", - urls = [ - "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz", - "https://github.com/bazelbuild/bazel-skylib/releases/download/1.5.0/bazel-skylib-1.5.0.tar.gz", - ], - sha256 = "cd55a062e763b9349921f0f5db8c3933288dc8ba4f76dd9416aac68acee3cb94", -) load("@onedal//dev/bazel/config:config.bzl", "declare_onedal_config") declare_onedal_config( diff --git a/cmake/scripts/generate_config.cmake b/cmake/scripts/generate_config.cmake index fb04a832435..a891736dd46 100644 --- a/cmake/scripts/generate_config.cmake +++ b/cmake/scripts/generate_config.cmake @@ -1,5 +1,6 @@ #=============================================================================== # Copyright 2021 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,7 +20,7 @@ set(DAL_ROOT_REL_PATH "../../..") set(INC_REL_PATH "include") set(LIB_REL_PATH "lib") set(DLL_REL_PATH "redist") -set(SUB_DIR "intel64") +set(ARCH_DIR_ONEDAL "${ARCH_DIR_ONEDAL}") # Parse version info if possible if (NOT "$ENV{DALROOT}" STREQUAL "") diff --git a/cmake/templates/oneDALConfig.cmake.in b/cmake/templates/oneDALConfig.cmake.in index 26ce3143a2b..73a63b625e7 100644 --- a/cmake/templates/oneDALConfig.cmake.in +++ b/cmake/templates/oneDALConfig.cmake.in @@ -209,14 +209,14 @@ foreach (_dal_component ${DAL_LIBS}) find_library( _dal_lib NAMES "${LIB_PREFIX}${_dal_component}${LIB_EXT}" - PATH_SUFFIXES "lib/intel64" + PATH_SUFFIXES "lib/@ARCH_DIR_ONEDAL@" PATHS "${_dal_root}") elseif (${ONEDAL_LINK} STREQUAL "dynamic") add_library(oneDAL::${_dal_component} SHARED IMPORTED) find_library( _dal_lib NAMES "${LIB_PREFIX}${_dal_component}${DLL_EXT}" - PATH_SUFFIXES "lib/intel64" + PATH_SUFFIXES "lib/@ARCH_DIR_ONEDAL@" PATHS "${_dal_root}") endif() diff --git a/cpp/daal/include/algorithms/algorithm_container_base_batch.h b/cpp/daal/include/algorithms/algorithm_container_base_batch.h index 2efe6fdf942..03b3d48b16e 100644 --- a/cpp/daal/include/algorithms/algorithm_container_base_batch.h +++ b/cpp/daal/include/algorithms/algorithm_container_base_batch.h @@ -1,6 +1,7 @@ /* file: algorithm_container_base_batch.h */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -142,11 +143,18 @@ class AlgorithmContainerImpl : public AlgorithmContainer * \tparam sse42Container Implementation for Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) * \tparam avx2Container Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) * \tparam avx512Container Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512 + * \tparam sve Implementation for ARM processors based on Arm Scalable Vector Extension */ + +#if defined(TARGET_X86_64) template class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl +#elif defined(TARGET_ARM) +template +class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl +#endif { public: /** diff --git a/cpp/daal/include/algorithms/algorithm_container_base_common.h b/cpp/daal/include/algorithms/algorithm_container_base_common.h index 6d0c946fead..5f63a868634 100644 --- a/cpp/daal/include/algorithms/algorithm_container_base_common.h +++ b/cpp/daal/include/algorithms/algorithm_container_base_common.h @@ -1,6 +1,7 @@ /* file: algorithm_container_base_common.h */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +25,8 @@ #ifndef __ALGORITHM_CONTAINER_BASE_COMMON_H__ #define __ALGORITHM_CONTAINER_BASE_COMMON_H__ +#include "services/daal_defines.h" + #include "algorithms/algorithm_container_base.h" #include "services/error_handling.h" #include "services/internal/gpu_support_checker.h" @@ -53,8 +56,13 @@ namespace interface1 * \tparam avx2Container Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) * \tparam avx512Container Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512 */ + +#if defined(TARGET_X86_64) template +#elif defined(TARGET_ARM) +template +#endif class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl { public: @@ -99,10 +107,15 @@ class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__) \ +#if defined(TARGET_X86_64) + #define __DAAL_ALGORITHM_CONTAINER(Mode, ContainerTemplate, ...) \ + algorithms::AlgorithmDispatchContainer DAAL_KERNEL_SSE42_CONTAINER( \ + ContainerTemplate, __VA_ARGS__) DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__) \ DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)> +#elif defined(TARGET_ARM) + #define __DAAL_ALGORITHM_CONTAINER(Mode, ContainerTemplate, ...) 
\ + algorithms::AlgorithmDispatchContainer DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)> +#endif /** @} */ } // namespace interface1 diff --git a/cpp/daal/include/services/daal_defines.h b/cpp/daal/include/services/daal_defines.h index 6d43749d7c8..5415d31dcb7 100644 --- a/cpp/daal/include/services/daal_defines.h +++ b/cpp/daal/include/services/daal_defines.h @@ -1,6 +1,7 @@ /* file: daal_defines.h */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +29,14 @@ #include // for size_t +#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64) || defined(_M_AMD64) + #define TARGET_X86_64 +#endif + +#if defined(__ARM_ARCH) || defined(__aarch64__) + #define TARGET_ARM +#endif + #if (defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)) && !defined(SYCL_LANGUAGE_VERSION) #define DAAL_INTEL_CPP_COMPILER #endif @@ -65,6 +74,8 @@ #if !defined(DAAL_INT) #if defined(_WIN64) || defined(__x86_64__) #define DAAL_INT __int64 + #elif defined(TARGET_ARM) + #define DAAL_INT __int64 #else #define DAAL_INT __int32 #endif diff --git a/cpp/daal/include/services/env_detect.h b/cpp/daal/include/services/env_detect.h index 83f4040dfac..9f6ad24fef7 100644 --- a/cpp/daal/include/services/env_detect.h +++ b/cpp/daal/include/services/env_detect.h @@ -1,6 +1,7 @@ /* file: env_detect.h */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,11 +43,16 @@ namespace daal */ enum CpuType { +#if defined(TARGET_X86_64) sse2 = 0, /*!< Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2) */ sse42 = 2, /*!< Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) */ avx2 = 4, /*!< Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) */ avx512 = 6, /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) */ lastCpuType = avx512 +#elif defined(TARGET_ARM) + sve = 0, /*!< ARM(R) processors based on Arm's Scalable Vector Extension (SVE) */ + lastCpuType = sve +#endif }; namespace services @@ -91,7 +97,12 @@ class DAAL_EXPORT Environment : public Base enum CpuTypeEnable { cpu_default = 0, /*!< Default processor type */ - avx512 = 2 /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) \DAAL_DEPRECATED */ + +#if defined(TARGET_X86_64) + avx512 = 2 /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) \DAAL_DEPRECATED */ +#elif defined(TARGET_ARM) + sve = 2, /*!< ARM(R) processors based on Arm's Scalable Vector Extension (SVE) */ +#endif }; /** @@ -167,7 +178,10 @@ class DAAL_EXPORT Environment : public Base _executionContext = internal::ImplAccessor::getImplPtr(ctx); } - services::internal::sycl::ExecutionContextIface & getDefaultExecutionContext() { return *_executionContext; } + services::internal::sycl::ExecutionContextIface & getDefaultExecutionContext() + { + return *_executionContext; + } private: Environment(); diff --git a/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h b/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h new file mode 100644 index 00000000000..799525128ef --- /dev/null +++ b/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h @@ -0,0 +1,41 @@ +/* file: aarch64_kernel_defines.h */ +/******************************************************************************* +* Copyright contributors to the 
oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef __aarch64_KERNEL_DEFINES_H__ +#define __aarch64_KERNEL_DEFINES_H__ + +#define DAAL_KERNEL_SVE + +#if defined(DAAL_KERNEL_SVE) + #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID + #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::sve + #define DAAL_KERNEL_SVE_ONLY(something) , something + #define DAAL_KERNEL_SVE_ONLY_CODE(...) __VA_ARGS__ + #define DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sve, __VA_ARGS__) + #define DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, ...) extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sve, __VA_ARGS__); + #define DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sve, __VA_ARGS__) + #define DAAL_KERNEL_SVE_CONTAINER_CASE_SYCL(ContainerTemplate, ...) +#else + #define DAAL_KERNEL_SVE_ONLY(something) + #define DAAL_KERNEL_SVE_ONLY_CODE(...) + #define DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, ...) + #define DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, ...) + #define DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, ...) + #define DAAL_KERNEL_SVE_CONTAINER_CASE_SYCL(ContainerTemplate, ...) 
+#endif + +#endif diff --git a/cpp/daal/include/services/internal/daal_kernel_defines.h b/cpp/daal/include/services/internal/daal_kernel_defines.h index fd631a61f3f..f4f723dfd13 100644 --- a/cpp/daal/include/services/internal/daal_kernel_defines.h +++ b/cpp/daal/include/services/internal/daal_kernel_defines.h @@ -1,6 +1,7 @@ /* file: daal_kernel_defines.h */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,11 +32,18 @@ * @ingroup services * @{ */ + #define DAAL_KERNEL_SSE2 #define DAAL_KERNEL_SSE42 #define DAAL_KERNEL_AVX2 #define DAAL_KERNEL_AVX512 +#if defined(TARGET_X86_64) + #include "services/internal/x86_64/x86_64_kernel_defines.h" +#elif defined(TARGET_ARM) + #include "services/internal/aarch64/aarch64_kernel_defines.h" +#endif + #define DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, cpuType, ...) ContainerTemplate<__VA_ARGS__, cpuType> #define DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, cpuType, ...) \ case cpuType: \ @@ -50,81 +58,6 @@ case cpuType: break; \ } -#if defined(DAAL_KERNEL_SSE2) - #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID - #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::sse2 - #define DAAL_KERNEL_SSE2_ONLY(something) , something - #define DAAL_KERNEL_SSE2_ONLY_CODE(...) __VA_ARGS__ - #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__) - #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...) \ - extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__); - #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...) 
DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse2, __VA_ARGS__) -#else - #define DAAL_KERNEL_SSE2_ONLY(something) - #define DAAL_KERNEL_SSE2_ONLY_CODE(...) - #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) -#endif - -#if defined(DAAL_KERNEL_SSE42) - #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID - #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::sse42 - #define DAAL_KERNEL_SSE42_ONLY(something) , something - #define DAAL_KERNEL_SSE42_ONLY_CODE(...) __VA_ARGS__ - #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__) - #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) \ - extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__); - #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse42, __VA_ARGS__) - #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, sse42, __VA_ARGS__) -#else - #define DAAL_KERNEL_SSE42_ONLY(something) - #define DAAL_KERNEL_SSE42_ONLY_CODE(...) - #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) -#endif - -#if defined(DAAL_KERNEL_AVX2) - #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID - #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx2 - #define DAAL_KERNEL_AVX2_ONLY(something) , something - #define DAAL_KERNEL_AVX2_ONLY_CODE(...) __VA_ARGS__ - #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) 
, DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__) - #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) \ - extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__); - #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx2, __VA_ARGS__) - #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx2, __VA_ARGS__) -#else - #define DAAL_KERNEL_AVX2_ONLY(something) - #define DAAL_KERNEL_AVX2_ONLY_CODE(...) - #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) - #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) - #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) -#endif - -#if defined(DAAL_KERNEL_AVX512) - #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID - #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx512 - #define DAAL_KERNEL_AVX512_ONLY(something) , something - #define DAAL_KERNEL_AVX512_ONLY_CODE(...) __VA_ARGS__ - #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__) - #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) \ - extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__); - #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx512, __VA_ARGS__) - #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx512, __VA_ARGS__) -#else - #define DAAL_KERNEL_AVX512_ONLY(something) - #define DAAL_KERNEL_AVX512_ONLY_CODE(...) - #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) - #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) - #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...) 
- #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) -#endif - #define DAAL_EXPAND(...) __VA_ARGS__ /** @} */ diff --git a/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h b/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h new file mode 100644 index 00000000000..f9570309739 --- /dev/null +++ b/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h @@ -0,0 +1,96 @@ +/* file: x86_64_kernel_defines.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef __x86_64_KERNEL_DEFINES_H__ +#define __x86_64_KERNEL_DEFINES_H__ + +#if defined(DAAL_KERNEL_SSE2) + #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID + #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::sse2 + #define DAAL_KERNEL_SSE2_ONLY(something) , something + #define DAAL_KERNEL_SSE2_ONLY_CODE(...) __VA_ARGS__ + #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__) + #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...) \ + extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__); + #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...) 
DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse2, __VA_ARGS__) +#else + #define DAAL_KERNEL_SSE2_ONLY(something) + #define DAAL_KERNEL_SSE2_ONLY_CODE(...) + #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...) + #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...) + #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...) + #define DAAL_KERNEL_SSE2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) +#endif + +#if defined(DAAL_KERNEL_SSE42) + #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID + #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::sse42 + #define DAAL_KERNEL_SSE42_ONLY(something) , something + #define DAAL_KERNEL_SSE42_ONLY_CODE(...) __VA_ARGS__ + #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__) + #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) \ + extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__); + #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse42, __VA_ARGS__) + #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, sse42, __VA_ARGS__) +#else + #define DAAL_KERNEL_SSE42_ONLY(something) + #define DAAL_KERNEL_SSE42_ONLY_CODE(...) + #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) + #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) + #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...) + #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) +#endif + +#if defined(DAAL_KERNEL_AVX2) + #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID + #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx2 + #define DAAL_KERNEL_AVX2_ONLY(something) , something + #define DAAL_KERNEL_AVX2_ONLY_CODE(...) __VA_ARGS__ + #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) 
, DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__) + #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) \ + extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__); + #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx2, __VA_ARGS__) + #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx2, __VA_ARGS__) +#else + #define DAAL_KERNEL_AVX2_ONLY(something) + #define DAAL_KERNEL_AVX2_ONLY_CODE(...) + #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) + #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) + #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...) + #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) +#endif + +#if defined(DAAL_KERNEL_AVX512) + #undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID + #define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx512 + #define DAAL_KERNEL_AVX512_ONLY(something) , something + #define DAAL_KERNEL_AVX512_ONLY_CODE(...) __VA_ARGS__ + #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__) + #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) \ + extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__); + #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx512, __VA_ARGS__) + #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx512, __VA_ARGS__) +#else + #define DAAL_KERNEL_AVX512_ONLY(something) + #define DAAL_KERNEL_AVX512_ONLY_CODE(...) + #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) + #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) + #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...) 
+ #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) +#endif + +#endif diff --git a/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp b/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp index 17b4c923eef..81b157d79ff 100644 --- a/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp +++ b/cpp/daal/src/algorithms/algorithm_hyperparameter.cpp @@ -1,6 +1,7 @@ /** file algorithm_hyperparameter.cpp */ /******************************************************************************* * Copyright 2023 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,10 +67,10 @@ struct HyperparameterImpl : public HyperparameterBaseImpl protected: /** Stores integer hyperparameters of the algorithm */ - HashTable _iHT; + HashTable _iHT; /** Stores floating point hyperparameters of the algorithm */ - HashTable _dHT; + HashTable _dHT; }; } // namespace internal diff --git a/cpp/daal/src/algorithms/covariance/covariance_impl.i b/cpp/daal/src/algorithms/covariance/covariance_impl.i index 24cb48524c5..0ebceeffcd7 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_impl.i +++ b/cpp/daal/src/algorithms/covariance/covariance_impl.i @@ -1,6 +1,7 @@ /* file: covariance_impl.i */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -135,8 +136,14 @@ static inline size_t getBlockSize(size_t nrows) return 140; } +#if defined(TARGET_X86_64) + #define DAAL_CPU_TYPE avx512 +#elif defined(TARGET_ARM) + #define DAAL_CPU_TYPE sve +#endif + template <> -inline size_t getBlockSize(size_t nrows) +inline size_t getBlockSize(size_t nrows) { return (nrows > 5000 && nrows <= 50000) ? 
1024 : 140; } diff --git a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i index b9d354caf90..16064a701b0 100644 --- a/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i +++ b/cpp/daal/src/algorithms/dtrees/dtrees_train_data_helper.i @@ -115,6 +115,7 @@ struct SplitData SplitData() : impurityDecrease(-daal::services::internal::MaxVal::get()), + left {}, featureValue(0.0), nLeft(0), iStart(0), diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i index 426457ecad7..377fffd20dd 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_impl.i @@ -1,6 +1,7 @@ /* file: df_classification_predict_dense_default_batch_impl.i */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -945,8 +946,12 @@ Status PredictClassificationTask::predictAllPointsByAllTre algorithmFPType * const res = resBD.get(); algorithmFPType * const prob = probBD.get(); daal::SafeStatus safeStat; - const size_t nRowsOfRes = _data->getNumberOfRows(); - const size_t blockSize = cpu == avx512 ? _DEFAULT_BLOCK_SIZE : _DEFAULT_BLOCK_SIZE_COMMON; + const size_t nRowsOfRes = _data->getNumberOfRows(); +#if defined(TARGET_X86_64) + const size_t blockSize = cpu == avx512 ? _DEFAULT_BLOCK_SIZE : _DEFAULT_BLOCK_SIZE_COMMON; +#elif defined(TARGET_ARM) + const size_t blockSize = cpu == sve ? 
_DEFAULT_BLOCK_SIZE : _DEFAULT_BLOCK_SIZE_COMMON; +#endif const size_t nBlocks = nRowsOfRes / blockSize; const size_t residualSize = nRowsOfRes - nBlocks * blockSize; algorithmFPType * commonBufVal = nullptr; diff --git a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i index 03b89e7843c..1306c154037 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i @@ -947,8 +947,8 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase rng; /* index for swapping samples in Fisher-Yates sampling */ diff --git a/cpp/daal/src/algorithms/export_win32e.def b/cpp/daal/src/algorithms/export_win32e.def index a55e32aea15..443714aef69 100644 --- a/cpp/daal/src/algorithms/export_win32e.def +++ b/cpp/daal/src/algorithms/export_win32e.def @@ -21,6 +21,9 @@ fpk_serv_memcpy_s fpk_serv_lock fpk_serv_unlock fpk_serv_strnlen_s +fpk_serv_strncpy_s +fpk_serv_strncat_s +fpk_serv_thread_yield fpk_serv_core_register_cleanup fpk_serv_calloc fpk_serv_printf_s diff --git a/cpp/daal/src/algorithms/kernel_config.h b/cpp/daal/src/algorithms/kernel_config.h index ed19658b813..e328311714f 100644 --- a/cpp/daal/src/algorithms/kernel_config.h +++ b/cpp/daal/src/algorithms/kernel_config.h @@ -1,6 +1,7 @@ /* file: kernel_config.h */ /******************************************************************************* * Copyright 2023 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +25,38 @@ #ifndef __KERNEL_CONFIG_H__ #define __KERNEL_CONFIG_H__ -#include "src/algorithms/kernel_inst_x86.h" +#include "services/daal_defines.h" +#include "src/services/service_defines.h" +#include "services/internal/daal_kernel_defines.h" +#include "services/internal/gpu_support_checker.h" + +#if defined(TARGET_X86_64) + #include "src/algorithms/kernel_inst_x86.h" +#elif defined(TARGET_ARM) + #include "src/algorithms/kernel_inst_arm.h" +#endif + +#define __DAAL_GET_CPUID int cpuid = daalEnv->cpuid; + +#define __DAAL_GET_CPUID_SAFE \ + int cpuid = DAAL_BASE_CPU; \ + DAAL_SAFE_CPU_CALL((cpuid = daalEnv->cpuid), (cpuid = DAAL_BASE_CPU)) + +#define __DAAL_KERNEL_MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(ContainerTemplate, Mode, ...) \ + __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID_SAFE, \ + __VA_ARGS__) + +#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER(ContainerTemplate, Mode, ...) \ + __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID, __VA_ARGS__) + +#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(ContainerTemplate, Mode, ...) \ + __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID, \ + __VA_ARGS__) + +#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(ContainerTemplate, Mode, ...) 
\ + __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID_SAFE, \ + __VA_ARGS__) #endif diff --git a/cpp/daal/src/algorithms/kernel_inst_arm.h b/cpp/daal/src/algorithms/kernel_inst_arm.h new file mode 100644 index 00000000000..e72d94ef019 --- /dev/null +++ b/cpp/daal/src/algorithms/kernel_inst_arm.h @@ -0,0 +1,71 @@ +/* file: kernel_inst_arm.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// The defines used for kernel allocation, deallocation, and calling kernel methods +//-- +*/ + +#ifndef __KERNEL_INST_ARM_H__ +#define __KERNEL_INST_ARM_H__ + +#define __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...) 
\ + DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ + namespace interface1 \ + { \ + template <> \ + ClassName DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>::ClassName( \ + daal::services::Environment::env * daalEnv) \ + : BaseClassName(daalEnv), _cntr(nullptr) \ + { \ + GetCpuid switch (__DAAL_KERNEL_MIN(DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID, cpuid)) \ + { \ + DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, __VA_ARGS__) \ + default: _cntr = (new ContainerTemplate<__VA_ARGS__, sve>(daalEnv)); break; \ + } \ + } \ + \ + template class ClassName DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ + } + +#define __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...) \ + DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ + namespace interface1 \ + { \ + template <> \ + ClassName DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>::ClassName( \ + daal::services::Environment::env * daalEnv) \ + : BaseClassName(daalEnv), _cntr(NULL) \ + { \ + GetCpuid switch (__DAAL_KERNEL_MIN(DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID, cpuid)) \ + { \ + DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, __VA_ARGS__) \ + default: \ + { \ + using cntrTemplateInst = ContainerTemplate<__VA_ARGS__, sve>; \ + static volatile services::internal::GpuSupportRegistrar registrar; \ + _cntr = (new cntrTemplateInst(daalEnv)); \ + break; \ + } \ + } \ + } \ + \ + template class ClassName DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ + } + +#endif diff --git a/cpp/daal/src/algorithms/kernel_inst_x86.h b/cpp/daal/src/algorithms/kernel_inst_x86.h index baf3d8d4153..1b30c74ccb1 100644 --- a/cpp/daal/src/algorithms/kernel_inst_x86.h +++ b/cpp/daal/src/algorithms/kernel_inst_x86.h @@ -24,19 +24,6 @@ #ifndef __KERNEL_INST_X86_H__ #define __KERNEL_INST_X86_H__ -#include "services/daal_defines.h" -#include "src/services/service_defines.h" -#include "services/internal/daal_kernel_defines.h" 
-#include "services/internal/gpu_support_checker.h" - -#define __DAAL_GET_CPUID int cpuid = daalEnv->cpuid; - -#define __DAAL_GET_CPUID_SAFE \ - int cpuid = DAAL_BASE_CPU; \ - DAAL_SAFE_CPU_CALL((cpuid = daalEnv->cpuid), (cpuid = DAAL_BASE_CPU)) - -#define __DAAL_KERNEL_MIN(a, b) ((a) < (b) ? (a) : (b)) - #define __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...) \ DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ @@ -64,13 +51,6 @@ DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ } -#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(ContainerTemplate, Mode, ...) \ - __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID_SAFE, \ - __VA_ARGS__) - -#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER(ContainerTemplate, Mode, ...) \ - __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID, __VA_ARGS__) - #define __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...) \ DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ @@ -104,12 +84,4 @@ DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ } -#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(ContainerTemplate, Mode, ...) \ - __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID, \ - __VA_ARGS__) - -#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(ContainerTemplate, Mode, ...) 
\ - __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID_SAFE, \ - __VA_ARGS__) - #endif diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp index 6ba45be595f..2f4e64e3c7d 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_csr_lloyd_batch_fpt_cpu.cpp @@ -37,7 +37,7 @@ template class BatchContainer; } namespace internal { -template class KMeansBatchKernel; +template class DAAL_EXPORT KMeansBatchKernel; } // namespace internal } // namespace kmeans } // namespace algorithms diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp index c46d8302cf0..0ccd42d0429 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_csr_plusplus_batch_fpt_cpu.cpp @@ -39,7 +39,7 @@ template class BatchContainer; } namespace internal { -template class KMeansInitKernel; +template class DAAL_EXPORT KMeansInitKernel; } // namespace internal } // namespace init } // namespace kmeans diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i old mode 100755 new mode 100644 index c32c63ed1fd..4f6ee638fd1 --- a/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i +++ b/cpp/daal/src/algorithms/kmeans/kmeans_plusplus_init_impl.i @@ -247,13 +247,13 @@ template class DataHelperCSR { public: - typedef BlockHelperCSR BlockHelperType; + typedef BlockHelperCSR BlockHelperType; DataHelperCSR(NumericTable * ntData) - : dim(ntData->getNumberOfColumns()), nRows(ntData->getNumberOfRows()), _nt(ntData), _csr(dynamic_cast(ntData)) + : dim(ntData->getNumberOfColumns()), nRows(ntData->getNumberOfRows()), 
_nt(ntData), _csr(dynamic_cast(ntData)) {} NumericTable * nt() const { return _nt; } - CSRNumericTableIface * ntIface() const { return _csr; } + CSRNumericTable * ntIface() const { return _csr; } Status updateMinDistInBlock(algorithmFPType * const minDistAccTrials, size_t nBlock, size_t iBlock, size_t nTrials, size_t iBestTrial, const algorithmFPType * aWeights, const algorithmFPType * const pLastAddedCenter, algorithmFPType * const aMinDist) @@ -261,11 +261,13 @@ public: const size_t iStartRow = iBlock * _nRowsInBlock; //start row const size_t nRowsToProcess = (iBlock == nBlock - 1) ? nRows - iBlock * _nRowsInBlock : _nRowsInBlock; //rows to process - ReadRowsCSR ntDataBD(_csr, iStartRow, nRowsToProcess); - DAAL_CHECK_BLOCK_STATUS(ntDataBD); - const algorithmFPType * const pData = ntDataBD.values(); - const size_t * const colIdx = ntDataBD.cols(); - const size_t * const rowIdx = ntDataBD.rows(); + // TODO: Better to use ReadRowsCSR, but there is a bug related to static library linking. + // Fixme when ReadRowsCSR will be fixed. + daal::data_management::CSRBlockDescriptor block; + _csr->getSparseBlock(iStartRow, nRowsToProcess, daal::data_management::readOnly, block); + const auto pData = block.getBlockValuesPtr(); + const auto colIdx = block.getBlockColumnIndicesPtr(); + const auto rowIdx = block.getBlockRowIndicesPtr(); algorithmFPType * const pDistSqBest = &aMinDist[iBestTrial * nRows + iStartRow]; const algorithmFPType * const weights = aWeights ? 
&aWeights[iStartRow] : nullptr; @@ -282,7 +284,7 @@ public: minDistAccTrials[iBestTrial * nBlock + iBlock] = updateMinDistForITrials(pDistSqBest, iBestTrial, nRowsToProcess, pData, colIdx, rowIdx, pLastAddedCenter, weights, pDistSqBest); - return Status(); + return _csr->releaseSparseBlock(block); } algorithmFPType updateMinDistForITrials(algorithmFPType * const pDistSq, size_t iTrials, size_t nRowsToProcess, @@ -316,19 +318,25 @@ public: //of the data in this row algorithmFPType copyOneRowCalcSumSq(size_t iRow, algorithmFPType * pDst) const { - ReadRowsCSR ntDataBD(_csr, iRow, 1); - const algorithmFPType * pData = ntDataBD.values(); - const size_t * colIdx = ntDataBD.cols(); - const size_t * rowIdx = ntDataBD.rows(); + // TODO: Better to use ReadRowsCSR, but there is a bug related to static library linking. + // Fixme when ReadRowsCSR will be fixed. + daal::data_management::CSRBlockDescriptor block; + _csr->getSparseBlock(iRow, 1, daal::data_management::readOnly, block); + const auto pData = block.getBlockValuesPtr(); + const auto colIdx = block.getBlockColumnIndicesPtr(); + const auto rowIdx = block.getBlockRowIndicesPtr(); daal::services::internal::service_memset(pDst, algorithmFPType(0.), dim); algorithmFPType res(0.); const size_t nValues = rowIdx[1] - rowIdx[0]; - for (size_t i = 0; i < nValues; ++i, ++pData, ++colIdx) + for (size_t i = 0; i < nValues; ++i) { - res += (*pData) * (*pData); - pDst[(*colIdx) - 1] = *pData; + const auto val = pData[i]; + res += val * val; + const auto colIndex = colIdx[i]; + pDst[colIndex - 1] = val; } + _csr->releaseSparseBlock(block); return res; } @@ -338,7 +346,7 @@ public: protected: NumericTable * _nt; - CSRNumericTableIface * _csr; + CSRNumericTable * _csr; }; //Base task class for kmeans++ and kmeans|| @@ -546,6 +554,7 @@ Status TaskPlusPlusBatch::run() //copy it to the result status |= this->copyPoints(&clusters[iCluster * this->_data.dim], &this->_lastAddedCenter[this->_trialBest * this->_data.dim], 1u); } + return 
status; } diff --git a/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp b/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp index e4d361eb40c..dff9b1aa101 100644 --- a/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/coordinate_descent/coordinate_descent_types_fpt.cpp @@ -60,10 +60,6 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in } const Parameter * algParam = static_cast(par); - if (!algParam->optionalResultRequired) - { - return s; - } return s; } template DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * par, diff --git a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i index 84f34861fb8..ce438f8b1fa 100755 --- a/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i +++ b/cpp/daal/src/algorithms/qr/qr_dense_default_pcl_impl.i @@ -1,6 +1,7 @@ /* file: qr_dense_default_pcl_impl.i */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -88,6 +89,8 @@ inline int * get_nblocks_array(int * size) return array; } /* rows/cols is greater or equal to: --------------------------------------------------------- 0 1 2 4 8 16 32 64 128 256 512 1K 2K ----------------------------------------------------*/ + +#if defined(TARGET_X86_64) template <> inline int * get_nblocks_array(int * size) { @@ -116,6 +119,22 @@ inline int * get_nblocks_array(int * size) *size = sizeof(array) / sizeof(int) - 1; return array; } +#elif defined(TARGET_ARM) +template <> +inline int * get_nblocks_array(int * size) +{ + static int array[] = { 1, 1, 1, 2, 4, 8, 16, 20, 24, 24, 20, 0 }; + *size = sizeof(array) / sizeof(int) - 1; + return array; +} +template <> +inline int * get_nblocks_array(int * size) +{ + static int array[] = { 1, 1, 1, 2, 4, 8, 16, 20, 20, 24, 20, 0 }; + *size = sizeof(array) / sizeof(int) - 1; + return array; +} +#endif #define QR_CHECK_BREAK(cond, error) \ if (!(cond)) \ diff --git a/cpp/daal/src/externals/service_dispatch.h b/cpp/daal/src/externals/service_dispatch.h index 0afb2d01921..9a6aef97e92 100644 --- a/cpp/daal/src/externals/service_dispatch.h +++ b/cpp/daal/src/externals/service_dispatch.h @@ -1,6 +1,7 @@ /* file: service_dispatch.h */ /******************************************************************************* * Copyright 2018 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,26 +27,43 @@ #include "services/internal/daal_kernel_defines.h" -#define DAAL_DISPATCH_FUNCTION_BY_CPU(func, ...) 
\ - switch (static_cast(daal::services::Environment::getInstance()->getCpuId())) \ - { \ - DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : func(daal::CpuType::sse42, __VA_ARGS__); break;) \ - DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : func(daal::CpuType::avx2, __VA_ARGS__); break;) \ - DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : func(daal::CpuType::avx512, __VA_ARGS__); break;) \ - DAAL_EXPAND(default : func(daal::CpuType::sse2, __VA_ARGS__); break;) \ - } +#if defined(TARGET_X86_64) + #define DAAL_DISPATCH_FUNCTION_BY_CPU(func, ...) \ + switch (static_cast(daal::services::Environment::getInstance()->getCpuId())) \ + { \ + DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : func(daal::CpuType::sse42, __VA_ARGS__); break;) \ + DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : func(daal::CpuType::avx2, __VA_ARGS__); break;) \ + DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : func(daal::CpuType::avx512, __VA_ARGS__); break;) \ + DAAL_EXPAND(default : func(daal::CpuType::sse2, __VA_ARGS__); break;) \ + } -#define DAAL_DISPATCH_FUNCTION_BY_CPU_SAFE(func, ...) \ - services::Status st; \ - int cpuid = daal::sse2; \ - DAAL_SAFE_CPU_CALL((cpuid = daal::services::Environment::getInstance()->getCpuId()), (cpuid = daal::sse2)) \ - switch (static_cast(cpuid)) \ - { \ - DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : st = func(daal::CpuType::sse42, __VA_ARGS__); break;) \ - DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : st = func(daal::CpuType::avx2, __VA_ARGS__); break;) \ - DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : st = func(daal::CpuType::avx512, __VA_ARGS__); break;) \ - DAAL_EXPAND(default : st = func(daal::CpuType::sse2, __VA_ARGS__); break;) \ - } \ - services::throwIfPossible(st); + #define DAAL_DISPATCH_FUNCTION_BY_CPU_SAFE(func, ...) 
\ + services::Status st; \ + int cpuid = daal::sse2; \ + DAAL_SAFE_CPU_CALL((cpuid = daal::services::Environment::getInstance()->getCpuId()), (cpuid = daal::sse2)) \ + switch (static_cast(cpuid)) \ + { \ + DAAL_KERNEL_SSE42_ONLY_CODE(case daal::CpuType::sse42 : st = func(daal::CpuType::sse42, __VA_ARGS__); break;) \ + DAAL_KERNEL_AVX2_ONLY_CODE(case daal::CpuType::avx2 : st = func(daal::CpuType::avx2, __VA_ARGS__); break;) \ + DAAL_KERNEL_AVX512_ONLY_CODE(case daal::CpuType::avx512 : st = func(daal::CpuType::avx512, __VA_ARGS__); break;) \ + DAAL_EXPAND(default : st = func(daal::CpuType::sse2, __VA_ARGS__); break;) \ + } \ + services::throwIfPossible(st); +#elif defined(TARGET_ARM) + #define DAAL_DISPATCH_FUNCTION_BY_CPU(func, ...) \ + switch (static_cast(daal::services::Environment::getInstance()->getCpuId())) \ + { \ + DAAL_KERNEL_SVE_ONLY_CODE(case daal::CpuType::sve : func(daal::CpuType::sve, __VA_ARGS__); break;) \ + } + #define DAAL_DISPATCH_FUNCTION_BY_CPU_SAFE(func, ...) \ + services::Status st; \ + int cpuid = daal::sve; \ + DAAL_SAFE_CPU_CALL((cpuid = daal::services::Environment::getInstance()->getCpuId()), (cpuid = daal::sve)) \ + switch (static_cast(cpuid)) \ + { \ + DAAL_KERNEL_SVE_ONLY_CODE(case daal::CpuType::sve : st = func(daal::CpuType::sve, __VA_ARGS__); break;) \ + } \ + services::throwIfPossible(st); +#endif #endif diff --git a/cpp/daal/src/services/compiler/generic/env_detect_features.cpp b/cpp/daal/src/services/compiler/generic/env_detect_features.cpp index ea61430b4a4..0f50e003f17 100644 --- a/cpp/daal/src/services/compiler/generic/env_detect_features.cpp +++ b/cpp/daal/src/services/compiler/generic/env_detect_features.cpp @@ -1,6 +1,7 @@ /* file: env_detect_features.cpp */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -21,10 +22,15 @@ //-- */ -#include - #include "services/env_detect.h" #include "services/daal_defines.h" + +#if defined(TARGET_X86_64) + #include +#elif defined(TARGET_ARM) + #include +#endif + #include "src/services/service_defines.h" #include "src/threading/threading.h" @@ -41,23 +47,24 @@ void __daal_serv_CPUHasAVX512f_enable_it_mac(); #endif +#if defined(TARGET_X86_64) void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd) { -#if defined(_MSC_VER) + #if defined(_MSC_VER) __cpuidex((int *)abcd, eax, ecx); -#else + #else uint32_t ebx, edx; - #if defined(__i386__) && defined(__PIC__) + #if defined(__i386__) && defined(__PIC__) /* in case of PIC under 32-bit EBX cannot be clobbered */ __asm__("movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx)); - #else + #else __asm__("cpuid" : "+b"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx)); - #endif + #endif abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; -#endif + #endif } bool __daal_internal_is_intel_cpu() @@ -86,11 +93,11 @@ static int check_cpuid(uint32_t eax, uint32_t ecx, int abcd_index, uint32_t mask static int check_xgetbv_xcr0_ymm(uint32_t mask) { uint32_t xcr0; -#if defined(_MSC_VER) + #if defined(_MSC_VER) xcr0 = (uint32_t)_xgetbv(0); -#else + #else __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); -#endif + #endif return ((xcr0 & mask) == mask); /* checking if xmm and ymm state are enabled in XCR0 */ } @@ -187,9 +194,9 @@ DAAL_EXPORT bool __daal_serv_cpu_extensions_available() DAAL_EXPORT int __daal_serv_cpu_detect(int enable) { -#if defined(__APPLE__) + #if defined(__APPLE__) __daal_serv_CPUHasAVX512f_enable_it_mac(); -#endif + #endif if (check_avx512_features() && daal_check_is_intel_cpu()) { return daal::avx512; @@ -207,3 +214,24 @@ DAAL_EXPORT int __daal_serv_cpu_detect(int enable) return daal::sse2; } +#elif defined(TARGET_ARM) +DAAL_EXPORT bool __daal_serv_cpu_extensions_available() +{ + return 0; +} + +DAAL_EXPORT int 
__daal_serv_cpu_detect(int enable) +{ + return daal::sve; +} + +void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd) +{ + // TODO: ARM implementation for cpuid +} + +bool daal_check_is_intel_cpu() +{ + return false; +} +#endif diff --git a/cpp/daal/src/services/env_detect.cpp b/cpp/daal/src/services/env_detect.cpp index 36f61d7f903..f50bd6358fa 100644 --- a/cpp/daal/src/services/env_detect.cpp +++ b/cpp/daal/src/services/env_detect.cpp @@ -1,6 +1,7 @@ /* file: env_detect.cpp */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,6 +32,12 @@ #include "src/services/service_topo.h" #include "src/threading/service_thread_pinner.h" +#if defined(TARGET_X86_64) + #define DAAL_HOST_CPUID daal::services::Environment::avx512 +#elif defined(TARGET_ARM) + #define DAAL_HOST_CPUID daal::services::Environment::sve +#endif + static daal::services::Environment::LibraryThreadingType daal_thr_set = (daal::services::Environment::LibraryThreadingType)-1; static bool isInit = false; @@ -80,7 +87,8 @@ DAAL_EXPORT int daal::services::Environment::enableInstructionsSet(int enable) DAAL_EXPORT int daal::services::Environment::setCpuId(int cpuid) { initNumberOfThreads(); - int host_cpuid = __daal_serv_cpu_detect(daal::services::Environment::avx512); + + int host_cpuid = __daal_serv_cpu_detect(DAAL_HOST_CPUID); if (!_env.cpuid_init_flag) { @@ -90,7 +98,7 @@ DAAL_EXPORT int daal::services::Environment::setCpuId(int cpuid) if (cpuid > host_cpuid) { - _cpu_detect(daal::services::Environment::avx512); + _cpu_detect(DAAL_HOST_CPUID); } else { diff --git a/cpp/daal/src/services/service_defines.h b/cpp/daal/src/services/service_defines.h index 70af3f301ff..ce1e0cd75f5 100644 --- a/cpp/daal/src/services/service_defines.h +++ 
b/cpp/daal/src/services/service_defines.h @@ -1,6 +1,7 @@ /* file: service_defines.h */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +34,11 @@ DAAL_EXPORT int __daal_serv_cpu_detect(int); void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd); bool daal_check_is_intel_cpu(); -#define DAAL_BASE_CPU daal::sse2 +#if defined(TARGET_X86_64) + #define DAAL_BASE_CPU daal::sse2 +#elif defined(TARGET_ARM) + #define DAAL_BASE_CPU daal::sve +#endif #define DAAL_CHECK_CPU_ENVIRONMENT (daal_check_is_intel_cpu()) @@ -117,18 +122,26 @@ enum DataFormat } // namespace daal /* CPU comparison macro */ -#define __sse2__ (0) -#define __sse42__ (2) -#define __avx2__ (4) -#define __avx512__ (6) +#if defined(TARGET_X86_64) + #define __sse2__ (0) + #define __sse42__ (2) + #define __avx2__ (4) + #define __avx512__ (6) +#elif defined(TARGET_ARM) + #define __sve__ (0) +#endif #define __float__ (0) #define __double__ (1) -#define CPU_sse2 __sse2__ -#define CPU_sse42 __sse42__ -#define CPU_avx2 __avx2__ -#define CPU_avx512 __avx512__ +#if defined(TARGET_X86_64) + #define CPU_sse2 __sse2__ + #define CPU_sse42 __sse42__ + #define CPU_avx2 __avx2__ + #define CPU_avx512 __avx512__ +#elif defined(TARGET_ARM) + #define CPU_sve __sve__ +#endif #define FPTYPE_float __float__ #define FPTYPE_double __double__ diff --git a/cpp/daal/src/services/service_topo.h b/cpp/daal/src/services/service_topo.h index 0340265f3ad..483e705af96 100644 --- a/cpp/daal/src/services/service_topo.h +++ b/cpp/daal/src/services/service_topo.h @@ -1,6 +1,7 @@ /* file: service_topo.h */ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under 
the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,9 +58,12 @@ typedef cpuset_t cpu_set_t; #define __cdecl - #ifdef __x86_64__ + #if defined(TARGET_X86_64) #define LNX_PTR2INT unsigned long long #define LNX_MY1CON 1LL + #elif defined(TARGET_ARM) +using LNX_PTR2INT = uintptr_t; +constexpr LNX_PTR2INT LNX_MY1CON = 1LL; #else #define LNX_PTR2INT unsigned int #define LNX_MY1CON 1 diff --git a/cpp/daal/src/threading/export_lnxarm.ref.def b/cpp/daal/src/threading/export_lnxarm.ref.def new file mode 100644 index 00000000000..58fccd3c71a --- /dev/null +++ b/cpp/daal/src/threading/export_lnxarm.ref.def @@ -0,0 +1,63 @@ +;=============================================================================== +; Copyright contributors to the oneDAL project +; +; Licensed under the Apache License, Version 2.0 (the "License"); +; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. 
+;=============================================================================== + +EXPORTS +openblas_set_num_threads +openblas_get_num_threads +ssyrk_ +dsyrk_ +ssyr_ +dsyr_ +sgemm_ +dgemm_ +ssymm_ +dsymm_ +sgemv_ +dgemv_ +saxpy_ +daxpy_ +sdot_ +ddot_ +sgetrf_ +dgetrf_ +sgetrs_ +dgetrs_ +spotrf_ +dpotrf_ +spotrs_ +dpotrs_ +spotri_ +dpotri_ +sgerqf_ +dgerqf_ +sormrq_ +dormrq_ +strtrs_ +dtrtrs_ +spptrf_ +dpptrf_ +sgeqrf_ +dgeqrf_ +sgeqp3_ +dgeqp3_ +sorgqr_ +dorgqr_ +sgesvd_ +dgesvd_ +ssyevd_ +dsyevd_ +sormqr_ +dormqr_ diff --git a/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp index 4091a492cbc..42b8186a5fe 100644 --- a/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel_dense.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2023 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +15,8 @@ * limitations under the License. 
*******************************************************************************/ +#include + #include "daal/src/algorithms/covariance/covariance_kernel.h" #include "oneapi/dal/algo/covariance/backend/cpu/finalize_compute_kernel.hpp" @@ -23,6 +26,12 @@ #include "oneapi/dal/table/row_accessor.hpp" +#if defined(TARGET_X86_64) +#define CPU_EXTENSION dal::detail::cpu_extension::avx512 +#elif defined(TARGET_ARM) +#define CPU_EXTENSION dal::detail::cpu_extension::sve +#endif + namespace oneapi::dal::covariance::backend { using dal::backend::context_cpu; @@ -64,7 +73,7 @@ static compute_result call_daal_kernel_finalize(const context_cpu& ctx, /// the logic of block size calculation is copied from DAAL, /// to be changed to passing the values from the performance model std::int64_t blockSize = 140; - if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) { + if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) { const std::int64_t row_count = rows_count_global; if (5000 < row_count && row_count <= 50000) { blockSize = 1024; diff --git a/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp b/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp index 2058eeb457a..d7ec3fc3acc 100644 --- a/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp +++ b/cpp/oneapi/dal/algo/covariance/backend/cpu/partial_compute_kernel_dense.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2023 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +24,12 @@ #include "oneapi/dal/table/row_accessor.hpp" +#if defined(TARGET_X86_64) +#define CPU_EXTENSION dal::detail::cpu_extension::avx512 +#elif defined(TARGET_ARM) +#define CPU_EXTENSION dal::detail::cpu_extension::sve +#endif + namespace oneapi::dal::covariance::backend { using dal::backend::context_cpu; @@ -53,7 +60,7 @@ static partial_compute_result call_daal_kernel_partial_compute( /// the logic of block size calculation is copied from DAAL, /// to be changed to passing the values from the performance model std::int64_t blockSize = 140; - if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) { + if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) { const std::int64_t row_count = data.get_row_count(); if (5000 < row_count && row_count <= 50000) { blockSize = 1024; diff --git a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp index c841038f172..10bf2da4501 100644 --- a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp +++ b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_dpc.cpp @@ -15,77 +15,43 @@ *******************************************************************************/ #include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel.hpp" -#include "oneapi/dal/algo/covariance/backend/gpu/misc.hpp" - +#include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp" #include "oneapi/dal/backend/primitives/lapack.hpp" #include "oneapi/dal/backend/primitives/reduction.hpp" #include "oneapi/dal/backend/primitives/stat.hpp" #include "oneapi/dal/backend/primitives/utils.hpp" - +#include "oneapi/dal/detail/policy.hpp" +#include "oneapi/dal/detail/common.hpp" #include "oneapi/dal/table/row_accessor.hpp" +#include "oneapi/dal/detail/profiler.hpp" namespace oneapi::dal::covariance::backend { -namespace bk = dal::backend; 
namespace pr = oneapi::dal::backend::primitives; -using alloc = sycl::usm::alloc; -using bk::context_gpu; +using method_t = method::dense; using task_t = task::compute; using input_t = partial_compute_result; using result_t = compute_result; using descriptor_t = detail::descriptor_base; -template -static compute_result finalize_compute(const context_gpu& ctx, - const descriptor_t& desc, - const partial_compute_result& input) { - auto& q = ctx.get_queue(); - - const std::int64_t column_count = input.get_partial_crossproduct().get_column_count(); - ONEDAL_ASSERT(column_count > 0); - - dal::detail::check_mul_overflow(column_count, column_count); - - auto bias = desc.get_bias(); - auto result = compute_result{}.set_result_options(desc.get_result_options()); - - const auto nobs_host = pr::table2ndarray(q, input.get_partial_n_rows()); - auto rows_count_global = nobs_host.get_data()[0]; - ONEDAL_ASSERT(rows_count_global > 0); - - const auto sums = - pr::table2ndarray_1d(q, input.get_partial_sum(), sycl::usm::alloc::device); - const auto xtx = - pr::table2ndarray(q, input.get_partial_crossproduct(), sycl::usm::alloc::device); - - if (desc.get_result_options().test(result_options::cov_matrix)) { - auto [cov, cov_event] = compute_covariance(q, rows_count_global, xtx, sums, bias); - result.set_cov_matrix( - (homogen_table::wrap(cov.flatten(q, { cov_event }), column_count, column_count))); - } - if (desc.get_result_options().test(result_options::cor_matrix)) { - auto [corr, corr_event] = compute_correlation(q, rows_count_global, xtx, sums); - result.set_cor_matrix( - (homogen_table::wrap(corr.flatten(q, { corr_event }), column_count, column_count))); - } - if (desc.get_result_options().test(result_options::means)) { - auto [means, means_event] = compute_means(q, sums, rows_count_global); - result.set_means(homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count)); - } - return result; +template +static result_t finalize_compute(const bk::context_gpu& ctx, + 
const descriptor_t& desc, + const input_t& input) { + return finalize_compute_kernel_dense_impl(ctx)(desc, input); } template -struct finalize_compute_kernel_gpu { - result_t operator()(const context_gpu& ctx, +struct finalize_compute_kernel_gpu { + result_t operator()(const bk::context_gpu& ctx, const descriptor_t& desc, const input_t& input) const { - return finalize_compute(ctx, desc, input); + return finalize_compute(ctx, desc, input); } }; -template struct finalize_compute_kernel_gpu; -template struct finalize_compute_kernel_gpu; +template struct finalize_compute_kernel_gpu; +template struct finalize_compute_kernel_gpu; } // namespace oneapi::dal::covariance::backend diff --git a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp new file mode 100644 index 00000000000..611ebb341b6 --- /dev/null +++ b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel.hpp" +#include "oneapi/dal/backend/primitives/utils.hpp" +#include "oneapi/dal/util/common.hpp" +#include "oneapi/dal/detail/policy.hpp" +#include "oneapi/dal/backend/communicator.hpp" + +#ifdef ONEDAL_DATA_PARALLEL + +namespace oneapi::dal::covariance::backend { + +namespace bk = dal::backend; + +template +class finalize_compute_kernel_dense_impl { + using task_t = task::compute; + using comm_t = bk::communicator; + using input_t = partial_compute_result; + using result_t = compute_result; + using descriptor_t = detail::descriptor_base; + using parameters_t = detail::compute_parameters; + +public: + finalize_compute_kernel_dense_impl(const bk::context_gpu& ctx) + : q(ctx.get_queue()), + comm_(ctx.get_communicator()) {} + result_t operator()(const descriptor_t& desc, const input_t& input); + +private: + sycl::queue q; + comm_t comm_; +}; + +} // namespace oneapi::dal::covariance::backend + +#endif // ONEDAL_DATA_PARALLEL diff --git a/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp new file mode 100644 index 00000000000..3a198252c17 --- /dev/null +++ b/cpp/oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl_dpc.cpp @@ -0,0 +1,112 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dal/algo/covariance/backend/gpu/finalize_compute_kernel_dense_impl.hpp" +#include "oneapi/dal/algo/covariance/backend/gpu/misc.hpp" + +#include "oneapi/dal/backend/common.hpp" +#include "oneapi/dal/detail/common.hpp" +#include "oneapi/dal/detail/policy.hpp" +#include "oneapi/dal/detail/profiler.hpp" + +#include "oneapi/dal/backend/primitives/lapack.hpp" +#include "oneapi/dal/backend/primitives/reduction.hpp" +#include "oneapi/dal/backend/primitives/stat.hpp" +#include "oneapi/dal/backend/primitives/utils.hpp" + +#include "oneapi/dal/table/row_accessor.hpp" + +#ifdef ONEDAL_DATA_PARALLEL + +namespace oneapi::dal::covariance::backend { + +namespace bk = dal::backend; +namespace pr = oneapi::dal::backend::primitives; +using alloc = sycl::usm::alloc; + +using bk::context_gpu; +using task_t = task::compute; +using input_t = partial_compute_result; +using result_t = compute_result; +using descriptor_t = detail::descriptor_base; + +/// A wrapper that computes 2d arrays of correlation or covariance matrix and 1d array of means. +/// The choice is based on the optional results +/// +/// @tparam Float Floating-point type used to perform computations +/// +/// @param[in] desc The descriptor of the algorithm +/// @param[in] input The partial_compute_result class with partial sums and xtx matrix +/// +/// @return The compute_result object, which contains functions to get covariance/correlation matrix or means. 
+template +result_t finalize_compute_kernel_dense_impl::operator()(const descriptor_t& desc, + const input_t& input) { + const std::int64_t column_count = input.get_partial_crossproduct().get_column_count(); + ONEDAL_ASSERT(column_count > 0); + + dal::detail::check_mul_overflow(column_count, column_count); + + auto bias = desc.get_bias(); + auto result = compute_result{}.set_result_options(desc.get_result_options()); + + const auto nobs_host = pr::table2ndarray(q, input.get_partial_n_rows()); + auto rows_count_global = nobs_host.get_data()[0]; + { + ONEDAL_PROFILER_TASK(allreduce_rows_count_global); + comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait(); + } + + ONEDAL_ASSERT(rows_count_global > 0); + + const auto sums = + pr::table2ndarray_1d(q, input.get_partial_sum(), sycl::usm::alloc::device); + + { + ONEDAL_PROFILER_TASK(allreduce_sums, q); + comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait(); + } + + const auto xtx = + pr::table2ndarray(q, input.get_partial_crossproduct(), sycl::usm::alloc::device); + + { + ONEDAL_PROFILER_TASK(allreduce_xtx, q); + comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait(); + } + + if (desc.get_result_options().test(result_options::cov_matrix)) { + auto [cov, cov_event] = compute_covariance(q, rows_count_global, xtx, sums, bias); + result.set_cov_matrix( + (homogen_table::wrap(cov.flatten(q, { cov_event }), column_count, column_count))); + } + if (desc.get_result_options().test(result_options::cor_matrix)) { + auto [corr, corr_event] = compute_correlation(q, rows_count_global, xtx, sums); + result.set_cor_matrix( + (homogen_table::wrap(corr.flatten(q, { corr_event }), column_count, column_count))); + } + if (desc.get_result_options().test(result_options::means)) { + auto [means, means_event] = compute_means(q, sums, rows_count_global); + result.set_means(homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count)); + } + return result; +} + +template class 
finalize_compute_kernel_dense_impl; +template class finalize_compute_kernel_dense_impl; +} // namespace oneapi::dal::covariance::backend + +#endif // ONEDAL_DATA_PARALLEL diff --git a/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp b/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp index 6092e5c81ff..d916608d6b3 100644 --- a/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp +++ b/cpp/oneapi/dal/algo/covariance/detail/finalize_compute_ops_dpc.cpp @@ -29,14 +29,16 @@ struct finalize_compute_ops_dispatcher { const partial_compute_result& input) const { using kernel_dispatcher_t = dal::backend::kernel_dispatcher< // KERNEL_SINGLE_NODE_CPU(backend::finalize_compute_kernel_cpu), - KERNEL_SINGLE_NODE_GPU(backend::finalize_compute_kernel_gpu)>; + KERNEL_UNIVERSAL_SPMD_GPU(backend::finalize_compute_kernel_gpu)>; return kernel_dispatcher_t()(policy, desc, input); } }; -#define INSTANTIATE(F, M, T) \ - template struct ONEDAL_EXPORT \ - finalize_compute_ops_dispatcher; +#define INSTANTIATE(F, M, T) \ + template struct ONEDAL_EXPORT \ + finalize_compute_ops_dispatcher; \ + template struct ONEDAL_EXPORT \ + finalize_compute_ops_dispatcher; INSTANTIATE(float, method::dense, task::compute) INSTANTIATE(double, method::dense, task::compute) diff --git a/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp b/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp index 22ee3d8a4b0..6f2b7e59065 100644 --- a/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp +++ b/cpp/oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2023 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +16,7 @@ *******************************************************************************/ #include +#include #include "oneapi/dal/detail/common.hpp" #include "oneapi/dal/detail/profiler.hpp" @@ -27,6 +29,12 @@ #include "oneapi/dal/algo/covariance/parameters/cpu/compute_parameters.hpp" +#if defined(TARGET_X86_64) +#define CPU_EXTENSION dal::detail::cpu_extension::avx512 +#elif defined(TARGET_ARM) +#define CPU_EXTENSION dal::detail::cpu_extension::sve +#endif + namespace oneapi::dal::covariance::parameters { using dal::backend::context_cpu; @@ -46,7 +54,7 @@ std::int64_t propose_block_size(const context_cpu& ctx, const std::int64_t row_c /// The constants are defined as the values that show the best performance results /// in the series of performance measurements with the varying block sizes and dataset sizes. std::int64_t block_size = 140l; - if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) { + if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) { /// Here if AVX512 extensions are available on CPU if (5000l < row_count && row_count <= 50000l) { block_size = 1024l; diff --git a/cpp/oneapi/dal/algo/covariance/test/fixture.hpp b/cpp/oneapi/dal/algo/covariance/test/fixture.hpp index 5cd74690a7f..f79e481db55 100644 --- a/cpp/oneapi/dal/algo/covariance/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/covariance/test/fixture.hpp @@ -37,6 +37,8 @@ class covariance_test : public te::crtp_algo_fixture { using Float = std::tuple_element_t<0, TestType>; using Method = std::tuple_element_t<1, TestType>; using input_t = cov::compute_input<>; + using partial_input_t = cov::partial_compute_input<>; + using partial_result_t = cov::partial_compute_result<>; using result_t = cov::compute_result<>; using descriptor_t = cov::descriptor; diff --git a/cpp/oneapi/dal/algo/covariance/test/online_spmd.cpp b/cpp/oneapi/dal/algo/covariance/test/online_spmd.cpp new file mode 100644 index 00000000000..bc4cf4f8dbd --- /dev/null +++ 
b/cpp/oneapi/dal/algo/covariance/test/online_spmd.cpp @@ -0,0 +1,131 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dal/algo/covariance/test/fixture.hpp" +#include "oneapi/dal/test/engine/tables.hpp" +#include "oneapi/dal/test/engine/io.hpp" + +namespace oneapi::dal::covariance::test { + +namespace te = dal::test::engine; +namespace la = te::linalg; +namespace cov = oneapi::dal::covariance; + +template +class covariance_online_spmd_test + : public covariance_test> { +public: + using base_t = covariance_test>; + using float_t = typename base_t::float_t; + using input_t = typename base_t::input_t; + using partial_input_t = typename base_t::partial_input_t; + using partial_result_t = typename base_t::partial_result_t; + using result_t = typename base_t::result_t; + + void set_rank_count(std::int64_t rank_count) { + rank_count_ = rank_count; + } + + void set_blocks_count(std::int64_t blocks_count) { + blocks_count_ = blocks_count; + } + + template + result_t finalize_compute_override(Args&&... 
args) { + return this->finalize_compute_via_spmd_threads_and_merge(rank_count_, + std::forward(args)...); + } + + result_t merge_finalize_compute_result_override(const std::vector& results) { + return results[0]; + } + + template + std::vector split_finalize_compute_input_override(std::int64_t split_count, + Args&&... args) { + ONEDAL_ASSERT(split_count == rank_count_); + const std::vector input{ std::forward(args)... }; + + return input; + } + + void online_spmd_general_checks(const te::dataframe& data_fr, + cov::result_option_id compute_mode, + const te::table_id& data_table_id) { + CAPTURE(static_cast(compute_mode)); + const table data = data_fr.get_table(this->get_policy(), data_table_id); + + const auto cov_desc = base_t::get_descriptor(compute_mode); + std::vector partial_results; + auto input_table = base_t::template split_table_by_rows(data, rank_count_); + for (int64_t i = 0; i < rank_count_; i++) { + dal::covariance::partial_compute_result<> partial_result; + auto input_table_blocks = + base_t::template split_table_by_rows(input_table[i], blocks_count_); + for (int64_t j = 0; j < blocks_count_; j++) { + partial_result = + this->partial_compute(cov_desc, partial_result, input_table_blocks[j]); + } + partial_results.push_back(partial_result); + } + const auto compute_result = this->finalize_compute_override(cov_desc, partial_results); + + base_t::check_compute_result(cov_desc, data, compute_result); + } + +private: + std::int64_t rank_count_; + std::int64_t blocks_count_; +}; + +using covariance_types = COMBINE_TYPES((float, double), (covariance::method::dense)); + +TEMPLATE_LIST_TEST_M(covariance_online_spmd_test, + "covariance common flow", + "[covariance][integration][spmd]", + covariance_types) { + SKIP_IF(this->get_policy().is_cpu()); + SKIP_IF(this->not_float64_friendly()); + + const te::dataframe data = + GENERATE_DATAFRAME(te::dataframe_builder{ 1000, 100 }.fill_normal(-30, 30, 7777), + te::dataframe_builder{ 2000, 20 }.fill_normal(0, 1, 7777), + 
te::dataframe_builder{ 2500, 20 }.fill_normal(-30, 30, 7777)); + this->set_rank_count(GENERATE(1, 2, 4)); + this->set_blocks_count(GENERATE(1, 3, 10)); + cov::result_option_id mode_mean = result_options::means; + cov::result_option_id mode_cov = result_options::cov_matrix; + cov::result_option_id mode_cor = result_options::cor_matrix; + cov::result_option_id mode_cov_mean = result_options::cov_matrix | result_options::means; + cov::result_option_id mode_cov_cor = result_options::cov_matrix | result_options::cor_matrix; + cov::result_option_id mode_cor_mean = result_options::cor_matrix | result_options::means; + cov::result_option_id res_all = + result_options::cov_matrix | result_options::cor_matrix | result_options::means; + + const cov::result_option_id compute_mode = GENERATE_COPY(mode_mean, + mode_cor, + mode_cov, + mode_cor_mean, + mode_cov_mean, + mode_cov_cor, + res_all); + + const auto data_table_id = this->get_homogen_table_id(); + + this->online_spmd_general_checks(data, compute_mode, data_table_id); +} + +} // namespace oneapi::dal::covariance::test diff --git a/cpp/oneapi/dal/algo/covariance/test/spmd.cpp b/cpp/oneapi/dal/algo/covariance/test/spmd.cpp index 568fd9cfaf0..ac50be44d2a 100644 --- a/cpp/oneapi/dal/algo/covariance/test/spmd.cpp +++ b/cpp/oneapi/dal/algo/covariance/test/spmd.cpp @@ -71,7 +71,7 @@ class covariance_spmd_test : public covariance_testcompute(cov_desc, data); + const auto compute_result = this->compute_override(cov_desc, data); base_t::check_compute_result(cov_desc, data, compute_result); } diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp index 56ea3021929..b2269d4cdd9 100644 --- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp @@ -1,5 +1,6 @@ 
/******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,10 +108,11 @@ vertex_similarity_result jaccard( return res; } +#if defined(TARGET_X86_64) template <> vertex_similarity_result jaccard( const detail::descriptor_base &desc, const dal::preview::detail::topology &t, void *result_ptr); - +#endif } // namespace oneapi::dal::preview::jaccard::backend diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp index ca350bec92a..7e5a2dda834 100644 --- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp +++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,8 +17,6 @@ #pragma once -#include - #include #include "oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp" diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp index f9c81a76cbc..8bb76b2ddbe 100644 --- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp +++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_cpu.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp index 15c5e3976cc..eef60006c5f 100644 --- a/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp +++ b/cpp/oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_skx.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +15,9 @@ * limitations under the License. 
*******************************************************************************/ +#if defined(TARGET_X86_64) #include +#endif #include "oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel.hpp" #include "oneapi/dal/algo/jaccard/backend/cpu/vertex_similarity_default_kernel_avx512.hpp" @@ -27,6 +30,7 @@ namespace oneapi::dal::preview::jaccard::backend { +#if defined(TARGET_X86_64) template vertex_similarity_result jaccard_avx512< dal::backend::cpu_dispatch_avx512>(const detail::descriptor_base& desc, const dal::preview::detail::topology& t, @@ -39,5 +43,6 @@ vertex_similarity_result jaccard(desc, t, result_ptr); } +#endif } // namespace oneapi::dal::preview::jaccard::backend diff --git a/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp b/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp index dd2d31f0277..17f44c6483a 100644 --- a/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp +++ b/cpp/oneapi/dal/algo/kmeans/backend/cpu/infer_kernel.cpp @@ -33,9 +33,21 @@ using descriptor_t = detail::descriptor_base; namespace daal_kmeans = daal::algorithms::kmeans; namespace interop = dal::backend::interop; -template -using daal_kmeans_lloyd_dense_kernel_t = - daal_kmeans::internal::KMeansBatchKernel; +template +using daal_method_constant = std::integral_constant; + +template +struct to_daal_method; + +template <> +struct to_daal_method : daal_method_constant {}; + +template <> +struct to_daal_method : daal_method_constant {}; + +template +using batch_kernel_t = + daal_kmeans::internal::KMeansBatchKernel::value, Float, Cpu>; inline auto get_daal_parameter_to_infer(const descriptor_t& desc) { const std::int64_t max_iteration_count = 0; @@ -55,11 +67,11 @@ inline auto get_daal_parameter_to_infer(const descriptor_t& desc) { return parameter; } -template +template static infer_result call_daal_kernel(const context_cpu& ctx, const descriptor_t& desc, const model& trained_model, - const table& data) { + const Table& data) { const std::int64_t 
row_count = data.get_row_count(); auto result = infer_result{}.set_result_options(desc.get_result_options()); @@ -84,11 +96,13 @@ static infer_result call_daal_kernel(const context_cpu& ctx, daal_objective_function_value.get(), nullptr }; - interop::status_to_exception( - interop::call_daal_kernel(ctx, - input, - output, - &par)); + interop::status_to_exception(dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) { + return batch_kernel_t::value, + Method>() + .compute(input, output, &par); + })); + if (desc.get_result_options().test(result_options::compute_assignments)) { result.set_responses( dal::detail::homogen_table_builder{}.reset(arr_responses, row_count, 1).build()); @@ -101,23 +115,28 @@ static infer_result call_daal_kernel(const context_cpu& ctx, return result; } -template +template static infer_result infer(const context_cpu& ctx, const descriptor_t& desc, const infer_input& input) { - return call_daal_kernel(ctx, desc, input.get_model(), input.get_data()); + using table_type = + std::conditional_t, csr_table, table>; + const auto data = static_cast(input.get_data()); + return call_daal_kernel(ctx, desc, input.get_model(), data); } -template -struct infer_kernel_cpu { +template +struct infer_kernel_cpu { infer_result operator()(const context_cpu& ctx, const descriptor_t& desc, const infer_input& input) const { - return infer(ctx, desc, input); + return infer(ctx, desc, input); } }; -template struct infer_kernel_cpu; -template struct infer_kernel_cpu; +template struct infer_kernel_cpu; +template struct infer_kernel_cpu; +template struct infer_kernel_cpu; +template struct infer_kernel_cpu; } // namespace oneapi::dal::kmeans::backend diff --git a/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd_dense.cpp b/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd.cpp similarity index 68% rename from cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd_dense.cpp rename to cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd.cpp index 
e3404e8b47d..f32dcaae63a 100644 --- a/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd_dense.cpp +++ b/cpp/oneapi/dal/algo/kmeans/backend/cpu/train_kernel_lloyd.cpp @@ -18,6 +18,7 @@ #include #include "oneapi/dal/algo/kmeans/backend/cpu/train_kernel.hpp" +#include "oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp" #include "oneapi/dal/backend/interop/common.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" @@ -32,50 +33,34 @@ using dal::backend::context_cpu; using descriptor_t = detail::descriptor_base; namespace daal_kmeans = daal::algorithms::kmeans; -namespace daal_kmeans_init = daal::algorithms::kmeans::init; namespace interop = dal::backend::interop; -template -using daal_kmeans_lloyd_dense_kernel_t = - daal_kmeans::internal::KMeansBatchKernel; +template +using daal_method_constant = std::integral_constant; -template -using daal_kmeans_init_plus_plus_dense_kernel_t = - daal_kmeans_init::internal::KMeansInitKernel; +template +struct to_daal_method; -template +template <> +struct to_daal_method : daal_method_constant {}; + +template <> +struct to_daal_method : daal_method_constant {}; + +template +using batch_kernel_t = + daal_kmeans::internal::KMeansBatchKernel::value, Float, Cpu>; + +template static daal::data_management::NumericTablePtr get_initial_centroids( const context_cpu& ctx, const descriptor_t& desc, - const table& data, + const Table& data, const table& initial_centroids) { - const std::int64_t column_count = data.get_column_count(); - const std::int64_t cluster_count = desc.get_cluster_count(); - daal::data_management::NumericTablePtr daal_initial_centroids; if (!initial_centroids.has_data()) { - const auto daal_data = interop::convert_to_daal_table(data); - daal_kmeans_init::Parameter par(dal::detail::integral_cast(cluster_count)); - - const std::size_t init_len_input = 1; - daal::data_management::NumericTable* init_input[init_len_input] = { daal_data.get() }; - 
daal_initial_centroids = - interop::allocate_daal_homogen_table(cluster_count, column_count); - const std::size_t init_len_output = 1; - daal::data_management::NumericTable* init_output[init_len_output] = { - daal_initial_centroids.get() - }; - - interop::status_to_exception( - interop::call_daal_kernel( - ctx, - init_len_input, - init_input, - init_len_output, - init_output, - &par, - *(par.engine))); + oneapi::dal::kmeans::detail::daal_generate_centroids(desc, data); } else { daal_initial_centroids = interop::convert_to_daal_table(initial_centroids); @@ -96,10 +81,10 @@ inline auto get_daal_parameter_to_train(const descriptor_t& desc) { return par; } -template +template static train_result call_daal_kernel(const context_cpu& ctx, const descriptor_t& desc, - const table& data, + const Table& data, const table& initial_centroids) { const std::int64_t row_count = data.get_row_count(); const std::int64_t column_count = data.get_column_count(); @@ -107,7 +92,8 @@ static train_result call_daal_kernel(const context_cpu& ctx, auto par = get_daal_parameter_to_train(desc); - auto daal_initial_centroids = get_initial_centroids(ctx, desc, data, initial_centroids); + auto daal_initial_centroids = + get_initial_centroids(ctx, desc, data, initial_centroids); const auto daal_data = interop::convert_to_daal_table(data); auto result = train_result{}; @@ -127,7 +113,6 @@ static train_result call_daal_kernel(const context_cpu& ctx, array arr_responses = array::empty(row_count); array arr_objective_function_value = array::empty(1); - const auto daal_responses = interop::convert_to_daal_homogen_table(arr_responses, row_count, 1); const auto daal_objective_function_value = interop::convert_to_daal_homogen_table(arr_objective_function_value, 1, 1); @@ -136,11 +121,13 @@ static train_result call_daal_kernel(const context_cpu& ctx, daal_responses.get(), daal_objective_function_value.get(), daal_iteration_count.get() }; - interop::status_to_exception( - interop::call_daal_kernel(ctx, - 
input, - output, - &par)); + + interop::status_to_exception(dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) { + return batch_kernel_t::value, + Method>() + .compute(input, output, &par); + })); result.set_objective_function_value(static_cast(arr_objective_function_value[0])); @@ -153,30 +140,31 @@ static train_result call_daal_kernel(const context_cpu& ctx, model().set_centroids(dal::detail::homogen_table_builder{} .reset(arr_centroids, cluster_count, column_count) .build())); - return result; } -template +template static train_result train(const context_cpu& ctx, const descriptor_t& desc, const train_input& input) { - return call_daal_kernel(ctx, - desc, - input.get_data(), - input.get_initial_centroids()); + using table_type = + std::conditional_t, csr_table, table>; + const auto data = static_cast(input.get_data()); + return call_daal_kernel(ctx, desc, data, input.get_initial_centroids()); } -template -struct train_kernel_cpu { +template +struct train_kernel_cpu { train_result operator()(const context_cpu& ctx, const descriptor_t& desc, const train_input& input) const { - return train(ctx, desc, input); + return train(ctx, desc, input); } }; template struct train_kernel_cpu; template struct train_kernel_cpu; +template struct train_kernel_cpu; +template struct train_kernel_cpu; } // namespace oneapi::dal::kmeans::backend diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp index 99c44b93b31..2eefffd1cd7 100644 --- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/infer_kernel_dpc.cpp @@ -19,7 +19,9 @@ #include "oneapi/dal/backend/primitives/utils.hpp" #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_integral.hpp" #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_fp.hpp" +#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp" #include "oneapi/dal/table/row_accessor.hpp" +#include 
"oneapi/dal/table/csr_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" @@ -105,7 +107,83 @@ struct infer_kernel_gpu { } }; -template struct infer_kernel_gpu; -template struct infer_kernel_gpu; +template +struct infer_kernel_gpu { + infer_result operator()(const dal::backend::context_gpu& ctx, + const descriptor_t& desc, + const infer_input& input) const { + auto& queue = ctx.get_queue(); + auto& comm = ctx.get_communicator(); + ONEDAL_ASSERT(input.get_data().get_kind() == dal::csr_table::kind()); + const auto data = static_cast(input.get_data()); + const std::int64_t row_count = data.get_row_count(); + const std::int64_t column_count = data.get_column_count(); + const std::int64_t cluster_count = desc.get_cluster_count(); + dal::detail::check_mul_overflow(cluster_count, column_count); + + auto [arr_val, arr_col, arr_row] = + csr_accessor(data).pull(queue, + { 0, -1 }, + sparse_indexing::zero_based, + sycl::usm::alloc::device); + auto values = pr::ndarray::wrap(arr_val.get_data(), arr_val.get_count()); + auto column_indices = + pr::ndarray::wrap(arr_col.get_data(), arr_col.get_count()); + auto row_offsets = + pr::ndarray::wrap(arr_row.get_data(), arr_row.get_count()); + auto arr_centroid_squares = + pr::ndarray::empty(queue, cluster_count, sycl::usm::alloc::device); + auto arr_data_squares = + pr::ndarray::empty(queue, row_count, sycl::usm::alloc::device); + auto data_squares_event = + compute_data_squares(queue, values, column_indices, row_offsets, arr_data_squares); + + auto distances = pr::ndarray::empty(queue, + { row_count, cluster_count }, + sycl::usm::alloc::device); + + auto arr_closest_distances = + pr::ndarray::empty(queue, { row_count, 1 }, sycl::usm::alloc::device); + auto arr_centroids = pr::table2ndarray(queue, + input.get_model().get_centroids(), + sycl::usm::alloc::device); + auto arr_responses = + pr::ndarray::empty(queue, { row_count, 1 }, sycl::usm::alloc::device); + + auto centroid_squares_event = kernels_fp::compute_squares(queue, + 
arr_centroids, + arr_centroid_squares, + { data_squares_event }); + auto assign_event = assign_clusters(queue, + values, + column_indices, + row_offsets, + arr_data_squares, + arr_centroids, + arr_centroid_squares, + distances, + arr_responses, + arr_closest_distances, + { data_squares_event, centroid_squares_event }); + auto objective_function = + calc_objective_function(queue, arr_closest_distances, { assign_event }); + { + // Reduce objective function value over all ranks + comm.allreduce(objective_function).wait(); + } + auto result = infer_result{}; + result.set_objective_function_value(objective_function); + + result.set_responses( + dal::homogen_table::wrap(arr_responses.flatten(queue, { assign_event }), row_count, 1)); + + return result; + } +}; + +template struct infer_kernel_gpu; +template struct infer_kernel_gpu; +template struct infer_kernel_gpu; +template struct infer_kernel_gpu; } // namespace oneapi::dal::kmeans::backend diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp new file mode 100644 index 00000000000..478bf9de85d --- /dev/null +++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp @@ -0,0 +1,412 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "oneapi/dal/backend/primitives/reduction.hpp" +#include "oneapi/dal/backend/interop/common_dpc.hpp" +#include "oneapi/dal/backend/interop/error_converter.hpp" +#include "oneapi/dal/backend/interop/table_conversion.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" +#include "oneapi/dal/backend/atomic.hpp" + +namespace oneapi::dal::kmeans::backend { + +using dal::backend::context_gpu; +using descriptor_t = detail::descriptor_base; +using event_vector = std::vector; + +template +using local_accessor_rw_t = sycl::local_accessor; + +namespace interop = dal::backend::interop; +namespace pr = dal::backend::primitives; +namespace de = dal::detail; +namespace bk = dal::backend; + +template +sycl::event compute_data_squares(sycl::queue& q, + const pr::ndview& values, + const pr::ndview& column_indices, + const pr::ndview& row_offsets, + pr::ndview& squares) { + ONEDAL_PROFILER_TASK(compute_data_squares, q); + return pr::reduce_by_rows(q, + values, + column_indices, + row_offsets, + sparse_indexing::zero_based, + squares, + pr::sum{}, + pr::square{}); +} + +// Temporary function: computes c = beta * c + alpha * A * b^T, where A is the CSR matrix given by (values, column_indices, row_offsets) and b, c are indexed as dense row-major arrays; TODO: replace this call with spgemm call +// When beta == 0 the previous content of c is not read, so c may be passed uninitialized. TODO: need to add dimensions integer overflow checks +template +sycl::event custom_spgemm(sycl::queue& q, + const pr::ndview& values, + const pr::ndview& column_indices, + const pr::ndview& row_offsets, + const pr::ndview& b, + pr::ndview& c, + const Float alpha, + const Float beta, + const event_vector& deps = {}) { + ONEDAL_PROFILER_TASK(custom_spgemm, q); + const size_t a_row_count = row_offsets.get_count() - 1; + const size_t reduce_dim = b.get_dimension(1); + const size_t b_row_count = b.get_dimension(0); + + const auto local_size = + std::min(bk::device_max_wg_size(q), bk::down_pow2(reduce_dim)); + auto res_ptr = c.get_mutable_data(); + const auto a_ptr = values.get_data(); + const auto row_ofs = row_offsets.get_data(); + const auto col_ind = 
column_indices.get_data(); + const auto b_ptr = b.get_data(); + + // Compute matrix block by block to avoid integer overflow + const std::int64_t row_block = 8 * bk::device_max_wg_size(q); + const std::int64_t row_block_size = std::min(row_block, a_row_count); + const std::int64_t col_block_size = std::min(row_block, b_row_count); + + const auto nd_range = + bk::make_multiple_nd_range_3d({ row_block_size, col_block_size, local_size }, + { 1, 1, local_size }); + + return q.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(nd_range, [=](auto item) { + const auto row_shift = item.get_global_id(0); + const auto col_shift = item.get_global_id(1); + const auto local_id = item.get_local_id(2); + + for (auto row_idx = row_shift; row_idx < a_row_count; row_idx += row_block) { + for (auto col_idx = col_shift; col_idx < b_row_count; col_idx += row_block) { + const auto start = row_ofs[row_idx] + local_id; + const auto end = row_ofs[row_idx + 1]; + Float acc = Float(0); + for (std::int64_t data_idx = start; data_idx < end; data_idx += local_size) { + const auto reduce_id = col_ind[data_idx]; + acc += a_ptr[data_idx] * b_ptr[col_idx * reduce_dim + reduce_id]; + } + const Float scalar_mul = + sycl::reduce_over_group(item.get_group(), + acc, + Float(0), + sycl::ext::oneapi::plus()); + if (local_id == 0) { + const auto res_idx = row_idx * b_row_count + col_idx; // beta == 0 must not read c: 0 * NaN/Inf from an uninitialized buffer would still be NaN + res_ptr[res_idx] = (beta == Float(0) ? Float(0) : beta * res_ptr[res_idx]) + alpha * scalar_mul; + } + } + } + }); + }); +} + +/// Calculates distances from each data point to each centroid and selects the closest centroid to each data point +/// @param[in] q A sycl-queue to perform operations on device +/// @param[in] values A data part of csr table with :expr:`non_zero_count x 1` dimensions +/// @param[in] column_indices An array of column indices in csr table with :expr:`non_zero_count x 1` dimensions +/// @param[in] row_offsets An array of row offsets in csr table with :expr:`(row_count + 1) x 1` dimensions +/// @param[in] 
data_squares An array of data squared elementwise with :expr:`row_count x 1` dimensions +/// @param[in] centroids An array of centroids with :expr:`cluster_count x column_count` dimensions +/// @param[in] centroid_squares An array of centroids squares with :expr:`cluster_count x 1` dimensions +/// @param[out] distances An array of distances of dataset to each cluster with :expr:`row_count x cluster_count` dimensions +/// @param[out] responses An array of responses with :expr:`row_count x 1` dimensions +/// @param[out] closest_dists An array of closest distances for each data point with :expr:`row_count x 1` dimensions +/// @param[in] deps An event vector of dependencies for specified kernel +template +sycl::event assign_clusters(sycl::queue& q, + const pr::ndview& values, + const pr::ndview& column_indices, + const pr::ndview& row_offsets, + const pr::ndview& data_squares, + const pr::ndview& centroids, + const pr::ndview& centroid_squares, + pr::ndview& distances, + pr::ndview& responses, + pr::ndview& closest_dists, + const event_vector& deps = {}) { + ONEDAL_PROFILER_TASK(assign_clusters, q); + auto data_squares_ptr = data_squares.get_data(); + auto cent_squares_ptr = centroid_squares.get_data(); + auto responses_ptr = responses.get_mutable_data(); + auto closest_dists_ptr = closest_dists.get_mutable_data(); + // Compute the cross term -2 * <data row, centroid> into distances; the squared norms are added in the kernel below + auto dist_event = custom_spgemm(q, + values, + column_indices, + row_offsets, + centroids, + distances, + Float(-2.0), + Float(0), + deps); + + const auto distances_ptr = distances.get_data(); + + const auto cluster_count = centroids.get_dimension(0); + const auto row_count = static_cast(row_offsets.get_count() - 1); + // based on benchmarks an optimal block size is equal to 8 work-group sizes + const std::int64_t block_multiplier = 8; + const std::int64_t row_block = block_multiplier * bk::device_max_wg_size(q); + + const auto local_size = + std::min(bk::device_max_wg_size(q), bk::down_pow2(cluster_count)); + const 
auto nd_range = + bk::make_multiple_nd_range_2d({ row_block, local_size }, { 1, local_size }); + + auto event = q.submit([&](sycl::handler& cgh) { + cgh.depends_on({ dist_event }); + cgh.depends_on(deps); + cgh.parallel_for(nd_range, [=](auto item) { + const auto row_shift = item.get_global_id(0); + const auto local_id = item.get_local_id(1); + const auto max_val = std::numeric_limits::max(); + const auto max_index = std::numeric_limits::max(); + for (auto row_idx = row_shift; row_idx < row_count; row_idx += row_block) { + auto min_dist = max_val; + auto min_idx = max_index; + auto row_dists = distances_ptr + row_idx * cluster_count; + for (std::int32_t cluster_id = local_id; cluster_id < cluster_count; + cluster_id += local_size) { + const auto dist = cent_squares_ptr[cluster_id] + row_dists[cluster_id] + + data_squares_ptr[row_idx]; + if (dist < min_dist) { + min_dist = dist; + min_idx = cluster_id; + } + } + const Float closest = sycl::reduce_over_group(item.get_group(), + min_dist, + max_val, + sycl::ext::oneapi::minimum()); + const std::int32_t dist_idx = closest == min_dist ? min_idx : max_index; // only work-items that hold the group minimum propose their index + const std::int32_t closest_id = + sycl::reduce_over_group(item.get_group(), + dist_idx, + max_index, + sycl::ext::oneapi::minimum()); + if (local_id == 0) { + responses_ptr[row_idx] = closest_id; + closest_dists_ptr[row_idx] = closest; + } + } + }); + }); + return event; +} + +// Calculates an objective function, which is sum of all distances from points to centroid. 
+/// @param[in] q A sycl-queue to perform operations on device +/// @param[in] dists An array of distances for each data point to the closest cluster +/// @param[in] deps An event vector of dependencies for specified kernel +template +Float calc_objective_function(sycl::queue& q, + const pr::ndview& dists, + const event_vector& deps = {}) { + ONEDAL_PROFILER_TASK(calc_objective_function, q); + pr::sum sum{}; + pr::identity ident{}; + auto view_1d = dists.template reshape<1>(pr::ndshape<1>{ dists.get_dimension(0) }); + return pr::reduce_1d(q, view_1d, sum, ident, deps); +} + +// Updates the centroids based on new responses and cluster counts. +// New centroid is a mean among all points in cluster. +// All centroids are zeroed first; a cluster with zero count is skipped by the accumulation and averaging kernels, so its centroid is left as zeros. +/// @param[in] q A sycl-queue to perform operations on device +/// @param[in] comm A communicator used to all-reduce the centroid sums over ranks +/// @param[in] values A data part of csr table with :expr:`non_zero_count x 1` dimensions +/// @param[in] column_indices An array of column indices in csr table with :expr:`non_zero_count x 1` dimensions +/// @param[in] row_offsets An array of row offsets in csr table with :expr:`(row_count + 1) x 1` dimensions +/// @param[in] column_count A number of columns in input dataset +/// @param[in] responses An array of cluster assignments with :expr:`row_count x 1` dimensions +/// @param[out] centroids An array of centroids with :expr:`cluster_count x column_count` dimensions +/// @param[in] cluster_counts An array of cluster counts with :expr:`cluster_count x 1` dimensions +/// @param[in] deps An event vector of dependencies for specified kernel +template +sycl::event update_centroids(sycl::queue& q, + const bk::communicator& comm, + const pr::ndview& values, + const pr::ndview& column_indices, + const pr::ndview& row_offsets, + std::int64_t column_count, + const pr::ndarray& responses, + pr::ndarray& centroids, + const pr::ndarray& cluster_counts, + const event_vector& deps = {}) { + ONEDAL_PROFILER_TASK(update_centroids, q); + const auto 
resp_ptr = responses.get_data(); + auto centroids_ptr = centroids.get_mutable_data(); + const auto row_count = row_offsets.get_count() - 1; + const auto data_ptr = values.get_data(); + const auto row_ofs_ptr = row_offsets.get_data(); + const auto col_ind_ptr = column_indices.get_data(); + const auto counts_ptr = cluster_counts.get_data(); + + const auto local_size = bk::device_max_wg_size(q); + const auto num_clusters = centroids.get_dimension(0); + + const auto clean_range = + bk::make_multiple_nd_range_2d({ num_clusters, column_count }, { 1, 1 }); + auto clean_event = q.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(clean_range, [=](auto it) { + const auto cluster_id = it.get_global_id(0); + const auto col_id = it.get_global_id(1); + centroids_ptr[cluster_id * column_count + col_id] = 0; + }); + }); + + const auto row_block = + std::min(bk::device_max_wg_size(q) * 8, bk::down_pow2(row_count)); + const auto col_block = + std::min(bk::device_max_wg_size(q), bk::down_pow2(column_count)); + const auto range = + bk::make_multiple_nd_range_3d({ num_clusters, row_block, col_block }, { 1, 1, col_block }); + + auto centroids_sum_event = q.submit([&](sycl::handler& cgh) { + cgh.depends_on(clean_event); + local_accessor_rw_t local_centroid(column_count, cgh); + cgh.parallel_for(range, [=](auto it) { + const auto cluster_id = it.get_global_id(0); + const auto row_shift = it.get_global_id(1); + const auto local_id = static_cast(it.get_local_id(2)); + if (counts_ptr[cluster_id] == 0) { + return; + } + auto local_centroid_ptr = + local_centroid.template get_multi_ptr().get_raw(); + for (std::int64_t col_idx = local_id; col_idx < column_count; col_idx += col_block) { + local_centroid_ptr[col_idx] = 0; + } + it.barrier(); + for (std::int64_t row_idx = row_shift; row_idx < row_count; row_idx += row_block) { + if (resp_ptr[row_idx] == static_cast(cluster_id)) { + const auto start = row_ofs_ptr[row_idx]; + const auto end = row_ofs_ptr[row_idx + 1]; + for 
(auto idx = start + local_id; idx < end; idx += col_block) { + const auto col_idx = col_ind_ptr[idx]; + const auto val = data_ptr[idx]; + bk::atomic_local_add(local_centroid_ptr + col_idx, val); + } + } + } + it.barrier(); + if (local_id == 0) { + for (std::int64_t col_idx = 0; col_idx < column_count; ++col_idx) { + const auto pos = cluster_id * column_count + col_idx; + bk::atomic_global_add(centroids_ptr + pos, local_centroid_ptr[col_idx]); + } + } + }); + }); + { + // Reduce centroids over all ranks in case of distributed computing + auto centroids_reduce_event = comm.allreduce(centroids.flatten(q, { centroids_sum_event })); + centroids_reduce_event.wait(); + } + + const auto finalize_range = + bk::make_multiple_nd_range_2d({ num_clusters, local_size }, { 1, local_size }); + auto finalize_centroids = q.submit([&](sycl::handler& cgh) { + cgh.depends_on(centroids_sum_event); + cgh.parallel_for(finalize_range, [=](auto it) { + const auto cluster_id = it.get_global_id(0); + const auto local_id = it.get_local_id(1); + const auto cent_count = counts_ptr[cluster_id]; + if (cent_count == 0) { + return; + } + for (std::int32_t col_idx = local_id; col_idx < column_count; col_idx += local_size) { + centroids_ptr[cluster_id * column_count + col_idx] /= cent_count; + } + }); + }); + return finalize_centroids; +} + +/// Handling empty clusters. 
+/// Each rank handles the clusters `rank, rank + rank_count, ...`; for every such cluster with zero count the row with the largest closest-distance is reassigned to it. +/// @param[in] ctx GPU context structure +/// @param[in] row_count A number of rows in the dataset +/// @param[out] responses An array of cluster assignments with :expr:`row_count x 1` dimensions +/// @param[out] cluster_counts An array of cluster counts with :expr:`cluster_count x 1` dimensions +/// @param[out] dists An array of closest distances to cluster with :expr:`row_count x 1` dimensions +/// @param[in] deps An event vector of dependencies for specified kernel +template +sycl::event handle_empty_clusters(const dal::backend::context_gpu& ctx, + const std::int64_t row_count, + pr::ndarray& responses, + pr::ndarray& cluster_counts, + pr::ndarray& dists, + const event_vector& deps = {}) { + auto& queue = ctx.get_queue(); + auto& comm = ctx.get_communicator(); + ONEDAL_PROFILER_TASK(handle_empty_clusters, queue); + const auto rank_count = comm.get_rank_count(); + const auto rank = comm.get_rank(); + const auto num_clusters = cluster_counts.get_dimension(0); + + auto resp_ptr = responses.get_mutable_data(); + auto counts_ptr = cluster_counts.get_mutable_data(); + auto dists_ptr = dists.get_mutable_data(); + + const auto abs_min_val = -std::numeric_limits::max(); + + auto local_size = bk::device_max_wg_size(queue); + auto range = bk::make_multiple_nd_range_1d(local_size, local_size); + auto event = queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(range, [=](auto it) { + const auto local_id = it.get_local_id(0); // the range is one-dimensional, so 0 is the only valid dimension + for (std::int64_t cluster_id = rank; cluster_id < num_clusters; + cluster_id += rank_count) { + // no need to handle non-empty clusters + if (counts_ptr[cluster_id] > 0) { + continue; + } + std::int64_t cand_idx = -1; + Float cand_dist = abs_min_val; + for (std::int64_t row_idx = local_id; row_idx < row_count; row_idx += local_size) { + const auto dist = dists_ptr[row_idx]; + if (dist > cand_dist) { + cand_dist = dist; + cand_idx = row_idx; + } + } + const Float longest_dist = + sycl::reduce_over_group(it.get_group(), + 
cand_dist, + abs_min_val, + sycl::ext::oneapi::maximum()); + const auto id = longest_dist == cand_dist ? cand_idx : -1; + const auto longest_id = + sycl::reduce_over_group(it.get_group(), + id, + sycl::ext::oneapi::maximum()); + if (local_id == 0 && longest_id != -1) { + resp_ptr[longest_id] = cluster_id; // responses and dists are indexed by row, counts by cluster + counts_ptr[cluster_id] = 1; + dists_ptr[longest_id] = Float(0); // zero distance keeps this row from being picked for the next empty cluster + } + } + }); + }); + return event; +} + +} // namespace oneapi::dal::kmeans::backend diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_csr_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_csr_dpc.cpp new file mode 100644 index 00000000000..00eeebf42c4 --- /dev/null +++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_csr_dpc.cpp @@ -0,0 +1,229 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "oneapi/dal/algo/kmeans/backend/gpu/train_kernel.hpp" +#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_integral.hpp" +#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_fp.hpp" +#include "oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp" +#include "oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" + +#include "oneapi/dal/detail/profiler.hpp" + +namespace oneapi::dal::kmeans::backend { + +using dal::backend::context_gpu; +using descriptor_t = detail::descriptor_base; +using event_vector = std::vector; + +namespace interop = dal::backend::interop; +namespace pr = dal::backend::primitives; +namespace de = dal::detail; +namespace bk = dal::backend; + +// Initializes centroids randomly on CPU if it was not set by user. +template +static pr::ndarray get_initial_centroids(const dal::backend::context_gpu& ctx, + const descriptor_t& params, + const train_input& input) { + auto& queue = ctx.get_queue(); + + const auto data = static_cast(input.get_data()); + + const std::int64_t column_count = data.get_column_count(); + const std::int64_t cluster_count = params.get_cluster_count(); + + if (!input.get_initial_centroids().has_data()) { + auto daal_initial_centroids = + oneapi::dal::kmeans::detail::daal_generate_centroids(params, data); + daal::data_management::BlockDescriptor block; + daal_initial_centroids->getBlockOfRows(0, + cluster_count, + daal::data_management::readOnly, + block); + Float* initial_centroids_ptr = block.getBlockPtr(); + auto arr_host_initial = + pr::ndarray::wrap(initial_centroids_ptr, { cluster_count, column_count }); + return arr_host_initial.to_device(queue); + } + auto initial_centroids_ptr = row_accessor(input.get_initial_centroids()) + .pull(queue, { 0, -1 }, sycl::usm::alloc::device); + return pr::ndarray::wrap(initial_centroids_ptr, { cluster_count, column_count }); +} + +/// Main 
entrypoint for GPU CSR Kmeans algorithm +/// @param[in] ctx GPU context structure +/// @param[in] params A descriptor containing parameters for algorithm +/// @param[in] input A train input +template +struct train_kernel_gpu { + train_result operator()(const dal::backend::context_gpu& ctx, + const descriptor_t& params, + const train_input& input) const { + auto& queue = ctx.get_queue(); + auto& comm = ctx.get_communicator(); + ONEDAL_ASSERT(input.get_data().get_kind() == dal::csr_table::kind()); + const auto data = static_cast(input.get_data()); + const std::int64_t row_count = data.get_row_count(); + const std::int64_t column_count = data.get_column_count(); + const std::int64_t cluster_count = params.get_cluster_count(); + const std::int64_t max_iteration_count = params.get_max_iteration_count(); + const double accuracy_threshold = params.get_accuracy_threshold(); + dal::detail::check_mul_overflow(cluster_count, column_count); + + auto [arr_val, arr_col, arr_row] = + csr_accessor(data).pull(queue, + { 0, -1 }, + sparse_indexing::zero_based, + sycl::usm::alloc::device); + auto values = pr::ndarray::wrap(arr_val.get_data(), arr_val.get_count()); + auto column_indices = + pr::ndarray::wrap(arr_col.get_data(), arr_col.get_count()); + auto row_offsets = + pr::ndarray::wrap(arr_row.get_data(), arr_row.get_count()); + auto arr_initial = get_initial_centroids(ctx, params, input); + auto arr_centroid_squares = + pr::ndarray::empty(queue, cluster_count, sycl::usm::alloc::device); + auto arr_data_squares = + pr::ndarray::empty(queue, row_count, sycl::usm::alloc::device); + auto data_squares_event = + compute_data_squares(queue, values, column_indices, row_offsets, arr_data_squares); + + auto distances = pr::ndarray::empty(queue, + { row_count, cluster_count }, + sycl::usm::alloc::device); + + auto arr_closest_distances = + pr::ndarray::empty(queue, { row_count, 1 }, sycl::usm::alloc::device); + auto arr_centroids = pr::ndarray::empty(queue, + { cluster_count, column_count 
}, + sycl::usm::alloc::device); + auto arr_responses = + pr::ndarray::empty(queue, { row_count, 1 }, sycl::usm::alloc::device); + auto cluster_counts = + pr::ndarray::empty(queue, cluster_count, sycl::usm::alloc::device); + + Float prev_objective_function = de::limits::max(); + std::int64_t iter; + sycl::event last_event = data_squares_event; + + for (iter = 0; iter < max_iteration_count; iter++) { + auto centroid_squares_event = + kernels_fp::compute_squares(queue, + iter == 0 ? arr_initial : arr_centroids, + arr_centroid_squares, + { last_event }); + auto assign_event = assign_clusters(queue, + values, + column_indices, + row_offsets, + arr_data_squares, + iter == 0 ? arr_initial : arr_centroids, + arr_centroid_squares, + distances, + arr_responses, + arr_closest_distances, + { centroid_squares_event, last_event }); + auto count_event = count_clusters(queue, + arr_responses, + cluster_count, + cluster_counts, + { assign_event }); + + { + // Cluster counters over all ranks in case of distributed computing + auto count_reduce_event = + comm.allreduce(cluster_counts.flatten(queue, { count_event })); + count_reduce_event.wait(); + } + + auto empty_cluster_event = handle_empty_clusters(ctx, + row_count, + arr_responses, + cluster_counts, + arr_closest_distances, + { count_event }); + + auto objective_function = calc_objective_function(queue, + arr_closest_distances, + { empty_cluster_event, count_event }); + + { + // Reduce objective function value over all ranks + auto obj_func_reduce_event = comm.allreduce(objective_function); + obj_func_reduce_event.wait(); + } + auto update_event = update_centroids(queue, + comm, + values, + column_indices, + row_offsets, + column_count, + arr_responses, + arr_centroids, + cluster_counts, + { count_event }); + + last_event = update_event; + + if (accuracy_threshold > 0 && + objective_function + accuracy_threshold > prev_objective_function) { + iter++; + break; + } + prev_objective_function = objective_function; + } + auto 
centroid_squares_event = + kernels_fp::compute_squares(queue, + iter == 0 ? arr_initial : arr_centroids, + arr_centroid_squares, + { last_event }); + auto assign_event = assign_clusters(queue, + values, + column_indices, + row_offsets, + arr_data_squares, + iter == 0 ? arr_initial : arr_centroids, + arr_centroid_squares, + distances, + arr_responses, + arr_closest_distances, + { last_event, centroid_squares_event }); + auto objective_function = + calc_objective_function(queue, + arr_closest_distances, + { last_event, centroid_squares_event, assign_event }); + { + // Reduce objective function value over all ranks + auto obj_func_reduce_event = comm.allreduce(objective_function); + obj_func_reduce_event.wait(); + } + + model model; + model.set_centroids( + dal::homogen_table::wrap(arr_centroids.flatten(queue), cluster_count, column_count)); + return train_result() + .set_responses(dal::homogen_table::wrap(arr_responses.flatten(queue), row_count, 1)) + .set_iteration_count(iter) + .set_objective_function_value(objective_function) + .set_model(model); + } +}; + +template struct train_kernel_gpu; +template struct train_kernel_gpu; + +} // namespace oneapi::dal::kmeans::backend diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp index efca39dd338..a8c02f27318 100644 --- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp @@ -21,6 +21,7 @@ #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_integral.hpp" #include "oneapi/dal/algo/kmeans/backend/gpu/cluster_updater.hpp" #include "oneapi/dal/algo/kmeans/backend/gpu/kernels_fp.hpp" +#include "oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp" #include "oneapi/dal/exceptions.hpp" #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/table/row_accessor.hpp" @@ -36,16 +37,11 @@ namespace 
oneapi::dal::kmeans::backend { using dal::backend::context_gpu; using descriptor_t = detail::descriptor_base; -namespace daal_kmeans_init = daal::algorithms::kmeans::init; namespace interop = dal::backend::interop; namespace pr = dal::backend::primitives; namespace de = dal::detail; namespace bk = dal::backend; -template -using daal_kmeans_init_plus_plus_dense_kernel_t = - daal_kmeans_init::internal::KMeansInitKernel; - template static pr::ndarray get_initial_centroids(const dal::backend::context_gpu& ctx, const descriptor_t& params, @@ -60,31 +56,9 @@ static pr::ndarray get_initial_centroids(const dal::backend::context_g daal::data_management::NumericTablePtr daal_initial_centroids; if (!input.get_initial_centroids().has_data()) { - // We use CPU algorithm for initialization, so input data - // may be copied to DAAL homogen table - const auto daal_data = interop::copy_to_daal_homogen_table(data); - daal_kmeans_init::Parameter par(dal::detail::integral_cast(cluster_count)); - - const std::size_t init_len_input = 1; - daal::data_management::NumericTable* init_input[init_len_input] = { daal_data.get() }; - daal_initial_centroids = - interop::allocate_daal_homogen_table(cluster_count, column_count); - const std::size_t init_len_output = 1; - daal::data_management::NumericTable* init_output[init_len_output] = { - daal_initial_centroids.get() - }; - - const dal::backend::context_cpu cpu_ctx; - interop::status_to_exception( - interop::call_daal_kernel( - cpu_ctx, - init_len_input, - init_input, - init_len_output, - init_output, - &par, - *(par.engine))); + oneapi::dal::kmeans::detail::daal_generate_centroids(params, + data); daal::data_management::BlockDescriptor block; daal_initial_centroids->getBlockOfRows(0, cluster_count, @@ -107,8 +81,8 @@ struct train_kernel_gpu { const train_input& input) const { auto& queue = ctx.get_queue(); auto& comm = ctx.get_communicator(); - const auto data = input.get_data(); + ONEDAL_ASSERT(data.get_kind() != dal::csr_table::kind()); 
const std::int64_t row_count = data.get_row_count(); const std::int64_t column_count = data.get_column_count(); const std::int64_t cluster_count = desc.get_cluster_count(); diff --git a/cpp/oneapi/dal/algo/kmeans/common.hpp b/cpp/oneapi/dal/algo/kmeans/common.hpp index 39ded2f1d82..7c406212b16 100644 --- a/cpp/oneapi/dal/algo/kmeans/common.hpp +++ b/cpp/oneapi/dal/algo/kmeans/common.hpp @@ -43,12 +43,17 @@ namespace v1 { /// method. struct lloyd_dense {}; +/// Tag-type that denotes :ref:`Lloyd's ` computational +/// method for sparse data. +struct lloyd_csr {}; + /// Alias tag-type for :ref:`Lloyd's ` computational /// method. using by_default = lloyd_dense; } // namespace v1 using v1::lloyd_dense; +using v1::lloyd_csr; using v1::by_default; } // namespace method @@ -95,7 +100,8 @@ template constexpr bool is_valid_float_v = dal::detail::is_one_of_v; template -constexpr bool is_valid_method_v = dal::detail::is_one_of_v; +constexpr bool is_valid_method_v = + dal::detail::is_one_of_v; template constexpr bool is_valid_task_v = dal::detail::is_one_of_v; diff --git a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp index 8c04129fb05..ce05fd6b75e 100644 --- a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp +++ b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops.cpp @@ -37,8 +37,10 @@ struct infer_ops_dispatcher { #define INSTANTIATE(F, M, T) \ template struct ONEDAL_EXPORT infer_ops_dispatcher; -INSTANTIATE(float, method::by_default, task::clustering) -INSTANTIATE(double, method::by_default, task::clustering) +INSTANTIATE(float, method::lloyd_dense, task::clustering) +INSTANTIATE(double, method::lloyd_dense, task::clustering) +INSTANTIATE(float, method::lloyd_csr, task::clustering) +INSTANTIATE(double, method::lloyd_csr, task::clustering) } // namespace v1 } // namespace oneapi::dal::kmeans::detail diff --git a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp index 
e27ac3549b0..6f00c81ecfb 100644 --- a/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans/detail/infer_ops_dpc.cpp @@ -42,8 +42,10 @@ struct infer_ops_dispatcher { template struct ONEDAL_EXPORT \ infer_ops_dispatcher; -INSTANTIATE(float, method::by_default, task::clustering) -INSTANTIATE(double, method::by_default, task::clustering) +INSTANTIATE(float, method::lloyd_dense, task::clustering) +INSTANTIATE(double, method::lloyd_dense, task::clustering) +INSTANTIATE(float, method::lloyd_csr, task::clustering) +INSTANTIATE(double, method::lloyd_csr, task::clustering) } // namespace v1 } // namespace oneapi::dal::kmeans::detail diff --git a/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp b/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp new file mode 100644 index 00000000000..544e24546f8 --- /dev/null +++ b/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp @@ -0,0 +1,89 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ +#include + +#include "oneapi/dal/backend/interop/common_dpc.hpp" +#include "oneapi/dal/backend/interop/error_converter.hpp" +#include "oneapi/dal/backend/interop/table_conversion.hpp" +#include "oneapi/dal/backend/transfer.hpp" + +namespace oneapi::dal::kmeans::detail { + +namespace daal_kmeans_init = daal::algorithms::kmeans::init; + +template +using daal_init_method_constant = std::integral_constant; +using descriptor_t = detail::descriptor_base; +namespace interop = dal::backend::interop; + +template +struct to_daal_init_method; + +template <> +struct to_daal_init_method + : daal_init_method_constant {}; + +template <> +struct to_daal_init_method + : daal_init_method_constant {}; + +template +using init_kernel_t = + daal_kmeans_init::internal::KMeansInitKernel::value, Float, Cpu>; + +template +inline daal::data_management::NumericTablePtr daal_generate_centroids(const descriptor_t& desc, + const Table& data) { + const std::int64_t column_count = data.get_column_count(); + const std::int64_t cluster_count = desc.get_cluster_count(); + daal::data_management::NumericTablePtr daal_initial_centroids; + const auto daal_data = interop::convert_to_daal_table(data, true); + + daal_kmeans_init::Parameter par(dal::detail::integral_cast(cluster_count)); + + const std::size_t init_len_input = 1; + const daal::data_management::NumericTable* init_input[init_len_input] = { daal_data.get() }; + + daal_initial_centroids = + interop::allocate_daal_homogen_table(cluster_count, column_count); + const std::size_t init_len_output = 1; + daal::data_management::NumericTable* init_output[init_len_output] = { + daal_initial_centroids.get() + }; + const dal::backend::context_cpu cpu_ctx; + interop::status_to_exception(dal::backend::dispatch_by_cpu(cpu_ctx, [&](auto cpu) { + return init_kernel_t::value, + Method>() + .compute(init_len_input, init_input, init_len_output, init_output, &par, *(par.engine)); + 
})); + return daal_initial_centroids; +} + +template daal::data_management::NumericTablePtr +daal_generate_centroids(const descriptor_t& desc, + const table& data); +template daal::data_management::NumericTablePtr +daal_generate_centroids(const descriptor_t& desc, + const table& data); +template daal::data_management::NumericTablePtr +daal_generate_centroids(const descriptor_t& desc, + const csr_table& data); +template daal::data_management::NumericTablePtr +daal_generate_centroids(const descriptor_t& desc, + const csr_table& data); + +} // namespace oneapi::dal::kmeans::detail diff --git a/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp b/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp index f7f085a068d..0be157b6f86 100644 --- a/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp +++ b/cpp/oneapi/dal/algo/kmeans/detail/train_ops.cpp @@ -38,6 +38,8 @@ struct train_ops_dispatcher { INSTANTIATE(float, method::lloyd_dense, task::clustering) INSTANTIATE(double, method::lloyd_dense, task::clustering) +INSTANTIATE(float, method::lloyd_csr, task::clustering) +INSTANTIATE(double, method::lloyd_csr, task::clustering) } // namespace v1 } // namespace oneapi::dal::kmeans::detail diff --git a/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp index cc071b82000..d7e672e1777 100644 --- a/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans/detail/train_ops_dpc.cpp @@ -45,6 +45,8 @@ struct train_ops_dispatcher { INSTANTIATE(float, method::lloyd_dense, task::clustering) INSTANTIATE(double, method::lloyd_dense, task::clustering) +INSTANTIATE(float, method::lloyd_csr, task::clustering) +INSTANTIATE(double, method::lloyd_csr, task::clustering) } // namespace v1 } // namespace oneapi::dal::kmeans::detail diff --git a/cpp/oneapi/dal/algo/kmeans/test/batch.cpp b/cpp/oneapi/dal/algo/kmeans/test/batch.cpp index 01663a209fb..3211daba5ee 100644 --- a/cpp/oneapi/dal/algo/kmeans/test/batch.cpp +++ 
b/cpp/oneapi/dal/algo/kmeans/test/batch.cpp @@ -15,7 +15,7 @@ *******************************************************************************/ #include "oneapi/dal/algo/kmeans/test/fixture.hpp" - +#include "oneapi/dal/table/csr_accessor.hpp" namespace oneapi::dal::kmeans::test { template @@ -74,6 +74,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][batch]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->check_empty_clusters(); } @@ -82,6 +83,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][batch]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->check_on_smoke_data(); } @@ -90,6 +92,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][batch]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->check_on_gold_data(); } @@ -100,7 +103,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, // This test is not stable on CPU // TODO: Remove the following `SKIP_IF` once stability problem is resolved SKIP_IF(this->get_policy().is_cpu()); - + SKIP_IF(this->is_sparse_method()); SKIP_IF(this->not_float64_friendly()); this->check_on_large_data_with_one_cluster(); } @@ -110,6 +113,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][batch][nightly][stress]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->partial_centroids_stress_test(); } @@ -118,6 +122,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][batch][external-dataset][higgs]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); const std::int64_t iters = 3; const std::string higgs_path = "workloads/higgs/dataset/higgs_1m_test.csv"; @@ -140,6 +145,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][nightly][batch][external-dataset][susy]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); const std::int64_t iters = 10; const 
std::string susy_path = "workloads/susy/dataset/susy_test.csv"; @@ -162,6 +168,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][nightly][batch][external-dataset][epsilon]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); const std::int64_t iters = 2; const std::string epsilon_path = "workloads/epsilon/dataset/epsilon_80k_train.csv"; @@ -184,6 +191,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][batch][external-dataset][higgs]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); const std::int64_t iters = 3; const std::string higgs_path = "workloads/higgs/dataset/higgs_1m_test.csv"; @@ -206,6 +214,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][nightly][batch][external-dataset][susy]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); const std::int64_t iters = 10; const std::string susy_path = "workloads/susy/dataset/susy_test.csv"; @@ -228,6 +237,7 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, "[kmeans][nightly][batch][external-dataset][epsilon]", kmeans_types) { SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); const std::int64_t iters = 2; const std::string epsilon_path = "workloads/epsilon/dataset/epsilon_80k_train.csv"; @@ -260,4 +270,50 @@ TEMPLATE_LIST_TEST_M(kmeans_batch_test, } } +TEMPLATE_LIST_TEST_M(kmeans_batch_test, + "KMmeans sparse default cases", + "[kmeans][batch]", + kmeans_types) { + SKIP_IF(!this->is_sparse_method()); + SKIP_IF(this->not_float64_friendly()); + + SECTION("cluster=5") { + auto input = oneapi::dal::test::engine::csr_make_blobs(5, 50, 20); + bool init_centroids = true; + this->test_on_sparse_data(input, 10, 0.01, init_centroids); + } + + SECTION("cluster=16") { + bool init_centroids = true; + auto input = oneapi::dal::test::engine::csr_make_blobs(16, 200, 100); + this->test_on_sparse_data(input, 10, 0.01, init_centroids); + } + + SECTION("cluster=128") { + 
SKIP_IF(this->get_policy().is_cpu()); + bool init_centroids = true; + auto input = oneapi::dal::test::engine::csr_make_blobs(128, 100000, 200); + this->test_on_sparse_data(input, 10, 0.01, init_centroids); + } + + SECTION("cluster=5") { + auto input = oneapi::dal::test::engine::csr_make_blobs(5, 50, 20); + bool init_centroids = false; + this->test_on_sparse_data(input, 20, 0.01, init_centroids); + } + + SECTION("cluster=16") { + bool init_centroids = false; + auto input = oneapi::dal::test::engine::csr_make_blobs(16, 200, 100); + this->test_on_sparse_data(input, 10, 0.01, init_centroids); + } + + SECTION("cluster=32") { + SKIP_IF(this->get_policy().is_cpu()); + bool init_centroids = false; + auto input = oneapi::dal::test::engine::csr_make_blobs(32, 10000, 100); + this->test_on_sparse_data(input, 30, 0.01, init_centroids); + } +} + } // namespace oneapi::dal::kmeans::test diff --git a/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp b/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp index 0a0e3acf3ae..1fdf3cc00e1 100644 --- a/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/kmeans/test/fixture.hpp @@ -26,6 +26,7 @@ #include "oneapi/dal/table/homogen.hpp" #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" +#include "oneapi/dal/test/engine/csr_table_builder.hpp" #include "oneapi/dal/test/engine/math.hpp" #include "oneapi/dal/test/engine/metrics/clustering.hpp" @@ -34,7 +35,8 @@ namespace oneapi::dal::kmeans::test { namespace te = dal::test::engine; namespace la = dal::test::engine::linalg; -using kmeans_types = COMBINE_TYPES((float, double), (kmeans::method::lloyd_dense)); +using kmeans_types = COMBINE_TYPES((float, double), + (kmeans::method::lloyd_dense, kmeans::method::lloyd_csr)); template class kmeans_test : public te::crtp_algo_fixture { @@ -63,6 +65,10 @@ class kmeans_test : public te::crtp_algo_fixture { return descriptor_t{ cluster_count }; } + bool is_sparse_method() { + return std::is_same_v; + } + void 
exact_checks(const table& data, const table& initial_centroids, const table& ref_centroids, @@ -285,6 +291,32 @@ class kmeans_test : public te::crtp_algo_fixture { this->exact_checks(x, x, x, y, cluster_count, 1, 0.0); } + void test_on_sparse_data(const oneapi::dal::test::engine::csr_make_blobs& input, + std::int64_t max_iter_count, + float_t accuracy_threshold, + bool init_centroids) { + const table data = input.get_data(this->get_policy()); + const auto cluster_count = input.cluster_count_; + REQUIRE(data.get_kind() == csr_table::kind()); + auto desc = this->get_descriptor(cluster_count, max_iter_count, accuracy_threshold); + INFO("KMeans sparse training"); + if (init_centroids) { + const table initial_centroids = input.get_initial_centroids(); + const auto train_result = this->train(desc, data, initial_centroids); + check_response_match(input.get_responses(), train_result.get_responses()); + } + else { + const auto train_result = this->train(desc, data); + const auto model = train_result.get_model(); + auto match_map = array::zeros(cluster_count); + find_match_centroids(input.get_result_centroids(), + model.get_centroids(), + input.column_count_, + match_map); + check_response_match(match_map, input.get_responses(), train_result.get_responses()); + } + } + void test_on_dataset(const std::string& dataset_path, std::int64_t cluster_count, std::int64_t max_iteration_count, diff --git a/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp b/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp index 792d5d74901..2c7e3d34063 100644 --- a/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp +++ b/cpp/oneapi/dal/algo/kmeans/test/spmd.cpp @@ -134,6 +134,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, // removed once it's supported for CPU. The same for the rest of tests cases. 
SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(GENERATE(2, 4)); this->check_if_results_same_on_all_ranks(); @@ -145,6 +146,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, kmeans_types) { SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(GENERATE(1, 2)); this->check_empty_clusters(); @@ -156,6 +158,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, kmeans_types) { SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(GENERATE(1, 2)); this->check_on_smoke_data(); @@ -167,6 +170,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, kmeans_types) { SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(GENERATE(1, 2, 4, 8)); this->check_on_gold_data(); @@ -178,6 +182,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, kmeans_types) { SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(GENERATE(1, 8)); this->check_on_large_data_with_one_cluster(); @@ -189,6 +194,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, kmeans_types) { SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(10); const std::int64_t iters = 3; @@ -213,6 +219,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, kmeans_types) { SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(10); const std::int64_t iters = 10; @@ -237,6 +244,7 @@ TEMPLATE_LIST_TEST_M(kmeans_spmd_test, kmeans_types) { SKIP_IF(this->get_policy().is_cpu()); SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->is_sparse_method()); this->set_rank_count(10); const std::int64_t iters = 2; diff --git 
a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp index f193593b588..e24cdb02539 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp +++ b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/train_kernel_dense_batch_dpc.cpp @@ -113,6 +113,10 @@ static train_result call_dal_kernel(const context_gpu& ctx, result.set_iterations_count(iter_num); } + if (options.test(result_options::inner_iterations_count)) { + result.set_inner_iterations_count(opt_impl->get_inner_iter()); + } + return result; } diff --git a/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp b/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp index 449a0cac61b..4fd6240f409 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp +++ b/cpp/oneapi/dal/algo/logistic_regression/backend/optimizer_impl.hpp @@ -36,7 +36,14 @@ class optimizer_impl : public base { virtual double get_tol() = 0; virtual std::int64_t get_max_iter() = 0; + // this function returns meaningful value only for newton_cg optimizer + // inner iterations value can be accessed after minimize method was called + virtual std::int64_t get_inner_iter() { + return -1; + } + #ifdef ONEDAL_DATA_PARALLEL + virtual std::pair minimize(sycl::queue& q, pr::base_function& f, pr::ndview& x, diff --git a/cpp/oneapi/dal/algo/logistic_regression/common.cpp b/cpp/oneapi/dal/algo/logistic_regression/common.cpp index 01a3a047187..d830dd3f441 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/common.cpp +++ b/cpp/oneapi/dal/algo/logistic_regression/common.cpp @@ -34,6 +34,10 @@ result_option_id get_iterations_count_id() { return result_option_id{ result_option_id::make_by_index(2) }; } +result_option_id get_inner_iterations_count_id() { + return result_option_id{ result_option_id::make_by_index(3) }; +} + template result_option_id 
get_default_result_options() { return result_option_id{}; diff --git a/cpp/oneapi/dal/algo/logistic_regression/common.hpp b/cpp/oneapi/dal/algo/logistic_regression/common.hpp index f8dd60c6ef2..b817b5b1101 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/common.hpp +++ b/cpp/oneapi/dal/algo/logistic_regression/common.hpp @@ -68,6 +68,7 @@ namespace detail { ONEDAL_EXPORT result_option_id get_intercept_id(); ONEDAL_EXPORT result_option_id get_coefficients_id(); ONEDAL_EXPORT result_option_id get_iterations_count_id(); +ONEDAL_EXPORT result_option_id get_inner_iterations_count_id(); } // namespace detail @@ -84,6 +85,9 @@ const inline result_option_id coefficients = detail::get_coefficients_id(); /// Return the number of iterations made by optimizer const inline result_option_id iterations_count = detail::get_iterations_count_id(); +/// Return the number of subiterations made by optimizer. Only available for newton-cg optimizer +const inline result_option_id inner_iterations_count = detail::get_inner_iterations_count_id(); + } // namespace result_options namespace detail { diff --git a/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp b/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp index 0ef343bbaec..b56df6bf0e9 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp +++ b/cpp/oneapi/dal/algo/logistic_regression/detail/optimizer.cpp @@ -32,7 +32,10 @@ namespace pr = be::primitives; class newton_cg_optimizer_impl : public optimizer_impl { public: - newton_cg_optimizer_impl(std::int64_t max_iter, double tol) : max_iter_(max_iter), tol_(tol) {} + newton_cg_optimizer_impl(std::int64_t max_iter, double tol) + : max_iter_(max_iter), + tol_(tol), + inner_iter_(0) {} optimizer_type get_optimizer_type() override { return optimizer_type::newton_cg; @@ -46,13 +49,22 @@ class newton_cg_optimizer_impl : public optimizer_impl { return max_iter_; } + // this parameter is set after minimize function was called + std::int64_t 
get_inner_iter() override { + return inner_iter_; + } + #ifdef ONEDAL_DATA_PARALLEL + template std::pair minimize_impl(sycl::queue& q, pr::base_function& f, pr::ndview& x, const be::event_vector& deps = {}) { - return pr::newton_cg(q, f, x, Float(tol_), max_iter_, 200l, deps); + auto [opt_event, max_iter, inner_iter] = + pr::newton_cg(q, f, x, Float(tol_), max_iter_, 200l, deps); + inner_iter_ = inner_iter; + return { opt_event, max_iter }; } std::pair minimize(sycl::queue& q, @@ -73,6 +85,7 @@ class newton_cg_optimizer_impl : public optimizer_impl { private: std::int64_t max_iter_; double tol_; + std::int64_t inner_iter_; }; template diff --git a/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp b/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp index a0f88b94ad9..b6d9591b2c5 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp +++ b/cpp/oneapi/dal/algo/logistic_regression/train_types.cpp @@ -39,6 +39,7 @@ class train_result_impl : public base { table intercept; table coefficients; std::int64_t iter_cnt; + std::int64_t inner_iter_cnt; result_option_id options; @@ -157,6 +158,24 @@ void train_result::set_iterations_count_impl(std::int64_t value) { impl_->iter_cnt = value; } +template +std::int64_t train_result::get_inner_iterations_count() const { + using msg = dal::detail::error_messages; + if (!get_result_options().test(result_options::inner_iterations_count)) { + throw domain_error(msg::this_result_is_not_enabled_via_result_options()); + } + return impl_->inner_iter_cnt; +} + +template +void train_result::set_inner_iterations_count_impl(std::int64_t value) { + using msg = dal::detail::error_messages; + if (!get_result_options().test(result_options::inner_iterations_count)) { + throw domain_error(msg::this_result_is_not_enabled_via_result_options()); + } + impl_->inner_iter_cnt = value; +} + template const table& train_result::get_coefficients() const { using msg = dal::detail::error_messages; diff --git 
a/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp b/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp index e39da6e2d1d..555e32f5cd9 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp +++ b/cpp/oneapi/dal/algo/logistic_regression/train_types.hpp @@ -150,6 +150,14 @@ class train_result { return *this; } + /// Number of optimizer subiterations + std::int64_t get_inner_iterations_count() const; + + auto& set_inner_iterations_count(std::int64_t value) { + set_inner_iterations_count_impl(value); + return *this; + } + /// Table of Logistic Regression coefficients and intercept const table& get_packed_coefficients() const; @@ -173,6 +181,7 @@ class train_result { void set_coefficients_impl(const table&); void set_packed_coefficients_impl(const table&); void set_iterations_count_impl(std::int64_t); + void set_inner_iterations_count_impl(std::int64_t); void set_result_options_impl(const result_option_id&); diff --git a/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp b/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp index e59c44d53b1..ffe447ec5d2 100644 --- a/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp +++ b/cpp/oneapi/dal/algo/pca/backend/cpu/finalize_train_kernel_cov.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2023 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +15,8 @@ * limitations under the License. 
*******************************************************************************/ +#include + #include #include #include "daal/src/algorithms/covariance/covariance_kernel.h" @@ -26,6 +29,12 @@ #include "oneapi/dal/backend/interop/table_conversion.hpp" #include "oneapi/dal/table/row_accessor.hpp" +#if defined(TARGET_X86_64) +#define CPU_EXTENSION dal::detail::cpu_extension::avx512 +#elif defined(TARGET_ARM) +#define CPU_EXTENSION dal::detail::cpu_extension::sve +#endif + namespace oneapi::dal::pca::backend { using dal::backend::context_cpu; @@ -84,7 +93,7 @@ static train_result call_daal_kernel_finalize_train(const context_cpu& ctx /// the logic of block size calculation is copied from DAAL, /// to be changed to passing the values from the performance model std::int64_t blockSize = 140; - if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) { + if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) { if (5000 < row_count && row_count <= 50000) { blockSize = 1024; } diff --git a/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp b/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp index 8400fd918fc..dff5d1a3a2a 100644 --- a/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp +++ b/cpp/oneapi/dal/algo/pca/backend/cpu/partial_train_kernel_cov.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2023 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +15,9 @@ * limitations under the License. 
*******************************************************************************/ +#include + +#include #include #include "daal/src/algorithms/covariance/covariance_kernel.h" @@ -25,6 +29,12 @@ #include "oneapi/dal/backend/interop/table_conversion.hpp" #include "oneapi/dal/table/row_accessor.hpp" +#if defined(TARGET_X86_64) +#define CPU_EXTENSION dal::detail::cpu_extension::avx512 +#elif defined(TARGET_ARM) +#define CPU_EXTENSION dal::detail::cpu_extension::sve +#endif + namespace oneapi::dal::pca::backend { using dal::backend::context_cpu; @@ -66,7 +76,7 @@ static partial_train_result call_daal_kernel_partial_train( /// the logic of block size calculation is copied from DAAL, /// to be changed to passing the values from the performance model std::int64_t blockSize = 140; - if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) { + if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) { const std::int64_t row_count = data.get_row_count(); if (5000 < row_count && row_count <= 50000) { blockSize = 1024; diff --git a/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp b/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp index ef12d49fe74..63b364f7e6b 100644 --- a/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp +++ b/cpp/oneapi/dal/algo/pca/backend/cpu/train_kernel_cov.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +15,8 @@ * limitations under the License. 
*******************************************************************************/ +#include + #include #include @@ -25,6 +28,12 @@ #include "oneapi/dal/backend/interop/table_conversion.hpp" #include "oneapi/dal/table/row_accessor.hpp" +#if defined(TARGET_X86_64) +#define CPU_EXTENSION dal::detail::cpu_extension::avx512 +#elif defined(TARGET_ARM) +#define CPU_EXTENSION dal::detail::cpu_extension::sve +#endif + namespace oneapi::dal::pca::backend { using dal::backend::context_cpu; @@ -83,7 +92,8 @@ static result_t call_daal_kernel(const context_cpu& ctx, /// the logic of block size calculation is copied from DAAL, /// to be changed to passing the values from the performance model std::int64_t blockSize = 140; - if (ctx.get_enabled_cpu_extensions() == dal::detail::cpu_extension::avx512) { + if (ctx.get_enabled_cpu_extensions() == CPU_EXTENSION) { + const std::int64_t row_count = data.get_row_count(); if (5000 < row_count && row_count <= 50000) { blockSize = 1024; } diff --git a/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp b/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp index 181dc2f31bc..f98462bc963 100644 --- a/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp +++ b/cpp/oneapi/dal/algo/subgraph_isomorphism/backend/cpu/compiler_adapt.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2021 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +17,13 @@ #pragma once #include + +#include + +#if defined(TARGET_X86_64) #include +#endif + #include #include "oneapi/dal/backend/dispatcher.hpp" @@ -83,6 +90,7 @@ ONEDAL_FORCEINLINE std::int32_t ONEDAL_popcnt64(std::uint64_t a) { #endif } +#if defined(TARGET_X86_64) template <> ONEDAL_FORCEINLINE std::int32_t ONEDAL_lzcnt_u32(std::uint32_t a) { if (a == 0) @@ -164,5 +172,20 @@ ONEDAL_FORCEINLINE std::int32_t ONEDAL_popcnt64 } return bit_cnt; } +#elif defined(TARGET_ARM) +template <> +ONEDAL_FORCEINLINE std::int32_t ONEDAL_lzcnt_u32(std::uint32_t a) { + return __builtin_clz(a); +} +template <> +ONEDAL_FORCEINLINE std::int32_t ONEDAL_lzcnt_u64(std::uint64_t a) { + return __builtin_clzl(a); +} + +template <> +ONEDAL_FORCEINLINE std::int32_t ONEDAL_popcnt64(std::uint64_t a) { + return __builtin_popcountl(a); +} +#endif } // namespace oneapi::dal::preview::subgraph_isomorphism::backend diff --git a/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp b/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp index cfbdc57231d..935e7057332 100644 --- a/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp +++ b/cpp/oneapi/dal/algo/triangle_counting/backend/cpu/intersection_tc.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +17,9 @@ #pragma once +#if defined(TARGET_X86_64) #include +#endif #include @@ -68,6 +71,7 @@ ONEDAL_FORCEINLINE std::int32_t _popcnt32_redef(const std::int32_t& x) { {} #endif +#if defined(TARGET_X86_64) template <> struct intersection_local_tc { ONEDAL_FORCEINLINE std::int64_t operator()(const std::int32_t* neigh_u, @@ -419,5 +423,6 @@ struct intersection_local_tc { return total; } }; +#endif } // namespace oneapi::dal::preview::triangle_counting::backend diff --git a/cpp/oneapi/dal/backend/dispatcher.cpp b/cpp/oneapi/dal/backend/dispatcher.cpp index 1a5e39e95dd..69974fabdbb 100644 --- a/cpp/oneapi/dal/backend/dispatcher.cpp +++ b/cpp/oneapi/dal/backend/dispatcher.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,17 +38,25 @@ void context_cpu::global_init() { inline constexpr detail::cpu_extension from_daal_cpu_type(daal::CpuType cpu) { using detail::cpu_extension; switch (cpu) { +#if defined(TARGET_X86_64) case daal::sse2: return cpu_extension::sse2; case daal::sse42: return cpu_extension::sse42; case daal::avx2: return cpu_extension::avx2; case daal::avx512: return cpu_extension::avx512; +#elif defined(TARGET_ARM) + case daal::sve: return cpu_extension::sve; +#endif } return cpu_extension::none; } detail::cpu_extension detect_top_cpu_extension() { if (!__daal_serv_cpu_extensions_available()) { +#if defined(TARGET_X86_64) return detail::cpu_extension::sse2; +#elif defined(TARGET_ARM) + return detail::cpu_extension::sve; +#endif } const auto daal_cpu = (daal::CpuType)__daal_serv_cpu_detect(0); return from_daal_cpu_type(daal_cpu); diff --git a/cpp/oneapi/dal/backend/dispatcher.hpp b/cpp/oneapi/dal/backend/dispatcher.hpp index 5325cb2efab..7737f214ebf 100644 --- a/cpp/oneapi/dal/backend/dispatcher.hpp +++ b/cpp/oneapi/dal/backend/dispatcher.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +17,8 @@ #pragma once +#include "daal/include/services/daal_defines.h" + #include "oneapi/dal/detail/policy.hpp" #include "oneapi/dal/detail/spmd_policy.hpp" @@ -38,11 +41,16 @@ namespace oneapi::dal::backend { detail::cpu_extension detect_top_cpu_extension(); +#if defined(TARGET_X86_64) struct cpu_dispatch_sse2 {}; struct cpu_dispatch_sse42 {}; struct cpu_dispatch_avx2 {}; struct cpu_dispatch_avx512 {}; +#elif defined(TARGET_ARM) +struct cpu_dispatch_sve {}; +#endif +#if defined(TARGET_X86_64) using cpu_dispatch_default = cpu_dispatch_sse2; #define __CPU_TAG_SSE2__ oneapi::dal::backend::cpu_dispatch_sse2 @@ -51,6 +59,13 @@ using cpu_dispatch_default = cpu_dispatch_sse2; #define __CPU_TAG_AVX512__ oneapi::dal::backend::cpu_dispatch_avx512 #define __CPU_TAG_DEFAULT__ oneapi::dal::backend::cpu_dispatch_default +#elif defined(TARGET_ARM) +using cpu_dispatch_default = cpu_dispatch_sve; + +#define __CPU_TAG_ARMV8SVE__ oneapi::dal::backend::cpu_dispatch_sve + +#endif + template class communicator_provider : public base { public: @@ -279,6 +294,8 @@ inline constexpr auto dispatch_by_cpu(const context_cpu& ctx, Op&& op) { using detail::cpu_extension; [[maybe_unused]] const cpu_extension cpu_ex = ctx.get_enabled_cpu_extensions(); + +#if defined(TARGET_X86_64) ONEDAL_IF_CPU_DISPATCH_AVX512(if (test_cpu_extension(cpu_ex, cpu_extension::avx512)) { return op(cpu_dispatch_avx512{}); }) @@ -286,6 +303,12 @@ inline constexpr auto dispatch_by_cpu(const context_cpu& ctx, Op&& op) { if (test_cpu_extension(cpu_ex, cpu_extension::avx2)) { return op(cpu_dispatch_avx2{}); }) ONEDAL_IF_CPU_DISPATCH_SSE42( if (test_cpu_extension(cpu_ex, cpu_extension::sse42)) { return op(cpu_dispatch_sse42{}); }) + +#elif defined(TARGET_ARM) + ONEDAL_IF_CPU_DISPATCH_A8SVE( + if (test_cpu_extension(cpu_ex, cpu_extension::sve)) { return op(cpu_dispatch_sve{}); }) +#endif + return op(cpu_dispatch_default{}); } diff --git a/cpp/oneapi/dal/backend/dispatcher_cpu.hpp 
b/cpp/oneapi/dal/backend/dispatcher_cpu.hpp index ef93e796f4b..ca7c92e0d38 100644 --- a/cpp/oneapi/dal/backend/dispatcher_cpu.hpp +++ b/cpp/oneapi/dal/backend/dispatcher_cpu.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,14 +17,21 @@ #pragma once +#include + #ifdef __ONEDAL_IDE_MODE__ // If this file is openned in IDE it will complain about // `_onedal_dispatcher_cpu.hpp` as this file is generated at build time. // It's recommended to define __ONEDAL_IDE_MODE__ in your IDE settings to // enable this branch for preprocessor. + +#if defined(TARGET_X86_64) #define ONEDAL_CPU_DISPATCH_SSE42 #define ONEDAL_CPU_DISPATCH_AVX2 #define ONEDAL_CPU_DISPATCH_AVX512 +#elif defined(TARGET_ARM) +#define ONEDAL_CPU_DISPATCH_A8SVE +#endif #else // This file is automatically generated by build system #include "oneapi/dal/_dal_cpu_dispatcher_gen.hpp" @@ -46,3 +54,9 @@ #else #define ONEDAL_IF_CPU_DISPATCH_AVX512(x) #endif + +#ifdef ONEDAL_CPU_DISPATCH_A8SVE +#define ONEDAL_IF_CPU_DISPATCH_A8SVE(x) x +#else +#define ONEDAL_IF_CPU_DISPATCH_A8SVE(x) +#endif diff --git a/cpp/oneapi/dal/backend/interop/common.hpp b/cpp/oneapi/dal/backend/interop/common.hpp index c9d6652396c..82a2d77700b 100644 --- a/cpp/oneapi/dal/backend/interop/common.hpp +++ b/cpp/oneapi/dal/backend/interop/common.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,6 +31,7 @@ struct daal_cpu_value { constexpr static daal::CpuType value = cpu; }; +#if defined(TARGET_X86_64) template <> struct to_daal_cpu_type : daal_cpu_value {}; template <> @@ -39,6 +41,12 @@ struct to_daal_cpu_type : daal_cpu_value {}; template <> struct to_daal_cpu_type : daal_cpu_value {}; +#elif defined(TARGET_ARM) +template <> +struct to_daal_cpu_type : daal_cpu_value {}; + +#endif + template typename CpuKernel, typename... Args> inline auto call_daal_kernel(const context_cpu& ctx, Args&&... args) { return dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) { diff --git a/cpp/oneapi/dal/backend/interop/table_conversion.hpp b/cpp/oneapi/dal/backend/interop/table_conversion.hpp index b1fca83cd3c..d68f1f179c6 100644 --- a/cpp/oneapi/dal/backend/interop/table_conversion.hpp +++ b/cpp/oneapi/dal/backend/interop/table_conversion.hpp @@ -20,6 +20,9 @@ #include #endif +#include + +#include "daal/src/data_management/service_numeric_table.h" #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/table/detail/table_builder.hpp" #include "oneapi/dal/table/backend/interop/sycl_table_adapter.hpp" @@ -122,7 +125,11 @@ inline daal::data_management::NumericTablePtr wrap_by_host_soa_adapter(const hom } template -inline daal::data_management::NumericTablePtr convert_to_daal_table(const homogen_table& table) { +inline daal::data_management::NumericTablePtr convert_to_daal_table(const homogen_table& table, + bool need_copy = false) { + if (need_copy) { + return copy_to_daal_homogen_table(table); + } if (table.get_data_layout() == data_layout::row_major) { if (auto wrapper = wrap_by_host_homogen_adapter(table)) { return wrapper; @@ -143,6 +150,7 @@ inline auto convert_to_daal_csr_table(array& data, std::int64_t row_count, std::int64_t column_count, bool allow_copy = false) { + using daal::services::Status; ONEDAL_ASSERT(data.get_count() == column_indices.get_count()); ONEDAL_ASSERT(row_indices.get_count() == row_count + 1); @@ -170,12 +178,17 @@ inline auto 
convert_to_daal_csr_table(array& data, reinterpret_cast(row_indices.get_mutable_data()), daal_object_owner{ row_indices }); - return daal::data_management::CSRNumericTable::create( + Status status; + const auto table = daal::data_management::CSRNumericTable::create( daal_data, daal_column_indices, daal_row_indices, dal::detail::integral_cast(column_count), - dal::detail::integral_cast(row_count)); + dal::detail::integral_cast(row_count), + daal::data_management::CSRNumericTable::CSRIndexing::oneBased, + &status); + status_to_exception(status); + return table; } template @@ -222,25 +235,22 @@ inline daal::data_management::CSRNumericTablePtr wrap_by_host_csr_adapter(const } template -inline daal::data_management::CSRNumericTablePtr convert_to_daal_table(const csr_table& table) { +inline daal::data_management::CSRNumericTablePtr convert_to_daal_table(const csr_table& table, + bool need_copy = false) { auto wrapper = wrap_by_host_csr_adapter(table); - if (!wrapper) { - return copy_to_daal_csr_table(table); - } - else { - return wrapper; - } + return need_copy || !wrapper ? 
copy_to_daal_csr_table(table) : wrapper; } template -inline daal::data_management::NumericTablePtr convert_to_daal_table(const table& table) { +inline daal::data_management::NumericTablePtr convert_to_daal_table(const table& table, + bool need_copy = false) { if (table.get_kind() == homogen_table::kind()) { const auto& homogen = static_cast(table); - return convert_to_daal_table(homogen); + return convert_to_daal_table(homogen, need_copy); } else if (table.get_kind() == csr_table::kind()) { const auto& csr = static_cast(table); - return convert_to_daal_table(csr); + return convert_to_daal_table(csr, need_copy); } else { return copy_to_daal_homogen_table(table); diff --git a/cpp/oneapi/dal/backend/memory.hpp b/cpp/oneapi/dal/backend/memory.hpp index 5f9e4bb71ff..4579af7fc9e 100644 --- a/cpp/oneapi/dal/backend/memory.hpp +++ b/cpp/oneapi/dal/backend/memory.hpp @@ -392,7 +392,7 @@ inline sycl::event copy_all2all(sycl::queue& queue, event = memcpy_host2usm(queue, dest, src, sizeof(T) * n, deps); } else { - copy(dest, src, sizeof(T) * n); + memcpy(dest, src, sizeof(T) * n); } return event; } diff --git a/cpp/oneapi/dal/backend/micromkl/macro.hpp b/cpp/oneapi/dal/backend/micromkl/macro.hpp index 3cd555e78cb..b46910ce6e5 100644 --- a/cpp/oneapi/dal/backend/micromkl/macro.hpp +++ b/cpp/oneapi/dal/backend/micromkl/macro.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2021 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +17,8 @@ #pragma once +#include + #ifndef __MICROMKL_INCLUDE_GUARD__ #error "This header cannot be included outside of micromkl module" #endif @@ -50,8 +53,12 @@ FUNC_CPU_DECL(nominal_cpu, prefix, name, argdecl) \ DISPATCH_FUNC_CPU(nominal_cpu, actual_cpu, prefix, name, argdecl, argcall) +#if defined(TARGET_X86_64) #define FUNC_AVX512(...) EXPAND(FUNC_CPU(avx512, avx512, __VA_ARGS__)) #define FUNC_AVX2(...) EXPAND(FUNC_CPU(avx2, avx2, __VA_ARGS__)) +#elif defined(TARGET_ARM) +#define FUNC_A8SVE(...) EXPAND(FUNC_CPU(sve, sve, __VA_ARGS__)) +#endif #ifdef __APPLE__ #define FUNC_SSE42(...) EXPAND(FUNC_CPU(sse42, avx2, __VA_ARGS__)) @@ -61,12 +68,18 @@ #define FUNC_SSE2(...) EXPAND(FUNC_CPU(sse2, sse2, __VA_ARGS__)) #endif +#if defined(TARGET_X86_64) #define FUNC(prefix, name, argdecl, argcall) \ DISPATCH_FUNC_DECL(prefix, name, argdecl) \ FUNC_AVX512(prefix, name, argdecl, argcall) \ FUNC_AVX2(prefix, name, argdecl, argcall) \ FUNC_SSE42(prefix, name, argdecl, argcall) \ FUNC_SSE2(prefix, name, argdecl, argcall) +#elif defined(TARGET_ARM) +#define FUNC(prefix, name, argdecl, argcall) \ + DISPATCH_FUNC_DECL(prefix, name, argdecl) \ + FUNC_A8SVE(prefix, name, argdecl, argcall) +#endif #ifdef ONEDAL_REF #define FUNC_DECL(prefix, floatabr, name, argdecl, argcall) \ @@ -83,6 +96,12 @@ #define INSTANTIATE_CPU(cpu, name, Float, argdecl) \ template void name argdecl(Float); +#ifdef ONEDAL_CPU_DISPATCH_A8SVE +#define INSTANTIATE_A8SVE(...) EXPAND(INSTANTIATE_CPU(sve, __VA_ARGS__)) +#else +#define INSTANTIATE_A8SVE(...) +#endif + #ifdef ONEDAL_CPU_DISPATCH_AVX512 #define INSTANTIATE_AVX512(...) EXPAND(INSTANTIATE_CPU(avx512, __VA_ARGS__)) #else @@ -103,11 +122,15 @@ #define INSTANTIATE_SSE2(...) 
EXPAND(INSTANTIATE_CPU(sse2, __VA_ARGS__)) +#if defined(TARGET_X86_64) #define INSTANTIATE_FLOAT(name, Float, argdecl) \ INSTANTIATE_AVX512(name, Float, argdecl) \ INSTANTIATE_AVX2(name, Float, argdecl) \ INSTANTIATE_SSE42(name, Float, argdecl) \ INSTANTIATE_SSE2(name, Float, argdecl) +#elif defined(TARGET_ARM) +#define INSTANTIATE_FLOAT(name, Float, argdecl) INSTANTIATE_A8SVE(name, Float, argdecl) +#endif #define FUNC_TEMPLATE(prefix, name, fargdecl, cargdecl, fargcall, cargcall) \ FUNC_DECL(prefix, s, name, fargdecl(float), fargcall) \ diff --git a/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp b/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp index b8dcc175838..341a0b7637e 100644 --- a/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp +++ b/cpp/oneapi/dal/backend/primitives/intersection/intersection.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +17,9 @@ #pragma once +#if defined(TARGET_X86_64) #include +#endif #include @@ -62,6 +65,7 @@ ONEDAL_FORCEINLINE std::int32_t _popcnt32_redef(const std::int32_t &x) { {} #endif +#if defined(TARGET_X86_64) template <> ONEDAL_FORCEINLINE std::int64_t intersection( const std::int32_t *neigh_u, @@ -569,5 +573,6 @@ ONEDAL_FORCEINLINE std::int64_t intersection( } return total; } +#endif } // namespace oneapi::dal::preview::backend diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index a18a727b163..39cae7db796 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -291,6 +291,18 @@ class logloss_test : public te::float_algo_fixture { } } + float_t clip_prob(float_t prob) { + constexpr float_t bottom = sizeof(float_t) > 4 ? 1e-15 : 1e-7; + constexpr float_t top = float_t(1.0) - bottom; + if (prob < bottom) { + prob = bottom; + } + if (prob > top) { + prob = top; + } + return prob; + } + float_t test_predictions_and_logloss(const ndview& data_host, const ndview& params_host, const ndview& labels_host, @@ -313,7 +325,7 @@ class logloss_test : public te::float_algo_fixture { if (fit_intercept) { pred += params_host.at(0); } - float_t prob = 1 / (1 + std::exp(-pred)); + float_t prob = clip_prob(float_t(1.0) / (1 + std::exp(-pred))); logloss -= labels_host.at(i) * std::log(prob) + (1 - labels_host.at(i)) * std::log(1 - prob); float_t out_val = probabilities.at(i); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp index 21516511acc..1035811798d 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg.hpp @@ -25,12 +25,12 @@ namespace oneapi::dal::backend::primitives { // pp. 
168 (also known as the truncated Newton method) // https://link.springer.com/book/10.1007/978-0-387-40065-5 template -std::pair newton_cg(sycl::queue& queue, - base_function& f, - ndview& x, - Float tol = 1.0e-5, - std::int64_t maxiter = 100l, - std::int64_t maxinner = 200l, - const event_vector& deps = {}); +std::tuple newton_cg(sycl::queue& queue, + base_function& f, + ndview& x, + Float tol = 1.0e-5, + std::int64_t maxiter = 100l, + std::int64_t maxinner = 200l, + const event_vector& deps = {}); } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp index f5acb90a3d7..b0c84ae0727 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp @@ -27,13 +27,13 @@ namespace oneapi::dal::backend::primitives { template -std::pair newton_cg(sycl::queue& queue, - base_function& f, - ndview& x, - Float tol, - std::int64_t maxiter, - std::int64_t maxinner, - const event_vector& deps) { +std::tuple newton_cg(sycl::queue& queue, + base_function& f, + ndview& x, + Float tol, + std::int64_t maxiter, + std::int64_t maxinner, + const event_vector& deps) { ONEDAL_PROFILER_TASK(newton_cg, queue); std::int64_t n = x.get_dimension(0); @@ -55,6 +55,7 @@ std::pair newton_cg(sycl::queue& queue, Float update_norm = tol + 1; std::int64_t cur_iter_id = 0; + std::int64_t inner_iter_sum = 0; while (cur_iter_id < maxiter) { cur_iter_id++; auto update_event_vec = f.update_x(x, true, last_iter_deps); @@ -98,6 +99,7 @@ std::pair newton_cg(sycl::queue& queue, Float(0), maxinner, { last_event }); + inner_iter_sum += inner_iter; // <-grad, direction> should be > 0 if direction is descent direction last_event = dot_product(queue, gradient, direction, tmp_gpu, &desc, { solve_event }); @@ -106,7 +108,7 @@ std::pair newton_cg(sycl::queue& queue, if (desc < 0) { // failed to find descent 
direction - return { last_event, cur_iter_id }; + return make_tuple(last_event, cur_iter_id, inner_iter_sum); } Float alpha_opt = backtracking(queue, @@ -127,17 +129,18 @@ std::pair newton_cg(sycl::queue& queue, last = copy(queue, x, buffer2, {}); last_iter_deps = { last }; } - return { last, cur_iter_id }; + return make_tuple(last, cur_iter_id, inner_iter_sum); } -#define INSTANTIATE(F) \ - template std::pair newton_cg(sycl::queue&, \ - base_function&, \ - ndview&, \ - F, \ - std::int64_t, \ - std::int64_t, \ - const event_vector&); +#define INSTANTIATE(F) \ + template std::tuple newton_cg( \ + sycl::queue&, \ + base_function&, \ + ndview&, \ + F, \ + std::int64_t, \ + std::int64_t, \ + const event_vector&); INSTANTIATE(float); INSTANTIATE(double); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index fa045d41142..914bda60f1f 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -90,13 +90,13 @@ class newton_cg_test : public te::float_algo_fixture { logloss_function(this->get_queue(), data, y_gpu, 3.0, true, bsz); auto [solution_, fill_e] = ndarray::zeros(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::device); - auto [opt_event, num_iter] = newton_cg(this->get_queue(), - logloss_func, - solution_, - float_t(1e-8), - 100l, - 200l, - { fill_e }); + auto [opt_event, num_iter, inner_iter] = newton_cg(this->get_queue(), + logloss_func, + solution_, + float_t(1e-8), + 100l, + 200l, + { fill_e }); opt_event.wait_and_throw(); auto solution_host = solution_.to_host(this->get_queue()); @@ -200,7 +200,7 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::zeros(this->get_queue(), { n_ }, sycl::usm::alloc::device); float_t conv_tol = sizeof(float_t) == 4 ? 
1e-7 : 1e-14; - auto [opt_event, num_iter] = + auto [opt_event, num_iter, inner_iter] = newton_cg(this->get_queue(), *func_, x, conv_tol, 100, 200l, { x_event }); opt_event.wait_and_throw(); auto x_host = x.to_host(this->get_queue()); diff --git a/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp b/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp index 7eeaf45cc26..5ce78c5f598 100644 --- a/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp +++ b/cpp/oneapi/dal/backend/primitives/reduction/reduction.hpp @@ -16,6 +16,7 @@ #pragma once +#include "oneapi/dal/table/csr.hpp" #include "oneapi/dal/backend/primitives/common.hpp" #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/reduction/functors.hpp" @@ -107,6 +108,61 @@ inline sycl::event reduce_by_columns(sycl::queue& q, return reduce_by_columns_impl(q, input, output, binary, unary, deps, override_init); } +template +sycl::event reduce_by_rows_impl(sycl::queue& q, + const ndview& values, + const ndview& column_indices, + const ndview& row_offsets, + const dal::sparse_indexing indexing, + ndview& output, + const BinaryOp& binary, + const UnaryOp& unary, + const event_vector& deps, + bool override_init = true); + +/// Reduces `input` rows in CSR format and put result into output +/// +/// @tparam Float Floating-point type used to perform computations +/// @tparam BinaryOp Type of binary operator functor +/// @tparam UnaryOp Type of unary operator functor +/// +/// @param[in] queue SYCL queue +/// @param[in] values An input of values array in CSR format +/// @param[in] column_indices An input of column indices array in CSR format +/// @param[in] row_offsets An input of row offsets array in CSR format +/// @param[in] indexing CSR indexing type. 
It can be `one_based` or `zero_based` +/// @param[out] output The result of reduction +/// @param[in] deps A vector of `sycl::event`s that represents list of dependencies +template +inline sycl::event reduce_by_rows(sycl::queue& q, + const ndview& values, + const ndview& column_indices, + const ndview& row_offsets, + const dal::sparse_indexing indexing, + ndview& output, + const BinaryOp& binary = BinaryOp{}, + const UnaryOp& unary = UnaryOp{}, + const event_vector& deps = {}, + bool override_init = true) { + ONEDAL_PROFILER_TASK(reduction.reduce_by_rows, q); + static_assert(dal::detail::is_tag_one_of_v, + "BinaryOp must be a special binary operation defined " + "at the primitives level"); + static_assert(dal::detail::is_tag_one_of_v, + "UnaryOp must be a special unary operation defined " + "at the primitives level"); + return reduce_by_rows_impl(q, + values, + column_indices, + row_offsets, + indexing, + output, + binary, + unary, + deps, + override_init); +} + #endif } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp b/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp index b71e75eb8c8..7e1251cb915 100644 --- a/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/reduction/reduction_dpc.cpp @@ -88,6 +88,65 @@ sycl::event reduce_by_rows_impl(sycl::queue& q, return sycl::event{}; } +/// Reduces CSR table with `n x m` dimensions by rows +/// +/// @tparam Float Floating point type, it can be `float` or `double` +/// @tparam BinaryOp Binary operation class, it reduces 2 input values into 1 +/// @tparam UnaryOp Unary operation class, it modifies an input value +/// +/// @param[in] q Sycl queue +/// @param[in] values An array of values in CSR table +/// @param[in] column_indices An array of column indices in CSR table +/// @param[in] row_offsets An array of row offsets in CSR table +/// @param[in] indexing Indexing kind of CSR table +/// 
@param[out] output An output array with dimensions `n x 1` +/// @param[in] binary A binary operation used in reduction +/// @param[in] unary An unary operation used in reduction +/// @param[in] deps A vector of dependent events +template +sycl::event reduce_by_rows_impl(sycl::queue& q, + const ndview& values, + const ndview& column_indices, + const ndview& row_offsets, + const dal::sparse_indexing indexing, + ndview& output, + const BinaryOp& binary, + const UnaryOp& unary, + const event_vector& deps, + bool override_init) { + ONEDAL_ASSERT(values.get_count() == column_indices.get_count()); + const std::int64_t row_block_size = device_max_wg_size(q); + const std::int64_t column_block_size = device_max_wg_size(q) / 2; + const auto range = + make_multiple_nd_range_2d({ row_block_size, column_block_size }, { 1, column_block_size }); + const auto val_ptr = values.get_data(); + const auto row_ptr = row_offsets.get_data(); + auto const out_ptr = output.get_mutable_data(); + const std::int64_t shift = bool(indexing == sparse_indexing::one_based); + const auto row_count = row_offsets.get_count() - 1; + return q.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(range, [=](auto it) { + const std::int64_t row_shift = it.get_global_id(0); + const std::int64_t col_shift = it.get_local_id(1); + for (auto row_idx = row_shift; row_idx < row_count; row_idx += row_block_size) { + const auto start = row_ptr[row_idx] - shift; + const auto end = row_ptr[row_idx + 1] - shift; + Float local_accum = binary.init_value; + for (auto idx = start + col_shift; idx < end; idx += column_block_size) { + const auto val = val_ptr[idx]; + local_accum = binary.native(local_accum, unary(val)); + } + const auto result = + sycl::reduce_over_group(it.get_group(), local_accum, binary.native); + if (col_shift == 0) { + out_ptr[row_idx] = override_init ? 
result : out_ptr[row_idx] + result; + } + } + }); + }); +} + template sycl::event reduce_by_columns_impl(sycl::queue& q, const ndview& input, @@ -123,10 +182,22 @@ sycl::event reduce_by_columns_impl(sycl::queue& q, const U&, \ const event_vector&, \ bool); +#define INSTANTIATE_CSR(F, B, U) \ + template sycl::event reduce_by_rows_impl(sycl::queue&, \ + const ndview&, \ + const ndview&, \ + const ndview&, \ + dal::sparse_indexing, \ + ndview&, \ + const B&, \ + const U&, \ + const event_vector&, \ + bool); #define INSTANTIATE_LAYOUT(F, B, U) \ INSTANTIATE(F, ndorder::c, B, U) \ - INSTANTIATE(F, ndorder::f, B, U) + INSTANTIATE(F, ndorder::f, B, U) \ + INSTANTIATE_CSR(F, B, U) #define INSTANTIATE_FLOAT(B, U) \ INSTANTIATE_LAYOUT(double, B, U); \ diff --git a/cpp/oneapi/dal/detail/dispatcher.hpp b/cpp/oneapi/dal/detail/dispatcher.hpp index 522287c3663..15d94d098d3 100644 --- a/cpp/oneapi/dal/detail/dispatcher.hpp +++ b/cpp/oneapi/dal/detail/dispatcher.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,22 +17,33 @@ #pragma once +#include + namespace oneapi::dal::detail { namespace v1 { +#if defined(TARGET_X86_64) struct cpu_dispatch_sse2 {}; struct cpu_dispatch_sse42 {}; struct cpu_dispatch_avx2 {}; struct cpu_dispatch_avx512 {}; - using cpu_dispatch_default = cpu_dispatch_sse2; +#elif defined(TARGET_ARM) +struct cpu_dispatch_sve {}; +using cpu_dispatch_default = cpu_dispatch_sve; +#endif } // namespace v1 +#if defined(TARGET_X86_64) using v1::cpu_dispatch_sse2; using v1::cpu_dispatch_sse42; using v1::cpu_dispatch_avx2; using v1::cpu_dispatch_avx512; +#elif defined(TARGET_ARM) +using v1::cpu_dispatch_sve; +#endif + using v1::cpu_dispatch_default; } // namespace oneapi::dal::detail diff --git a/cpp/oneapi/dal/detail/policy.hpp b/cpp/oneapi/dal/detail/policy.hpp index c62c31e9c30..127c9770d6c 100644 --- a/cpp/oneapi/dal/detail/policy.hpp +++ b/cpp/oneapi/dal/detail/policy.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2020 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +17,15 @@ #pragma once +// TODO: Clean up this redefinition and import the defines globally. 
+#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64) || defined(_M_AMD64) +#define TARGET_X86_64 +#endif + +#if defined(__ARM_ARCH) || defined(__aarch64__) +#define TARGET_ARM +#endif + #include #ifdef ONEDAL_DATA_PARALLEL #include @@ -61,10 +71,14 @@ inline constexpr bool is_data_parallel_policy_v = is_data_parallel_policy::va enum class cpu_extension : uint64_t { none = 0U, +#if defined(TARGET_X86_64) sse2 = 1U << 0, sse42 = 1U << 2, avx2 = 1U << 4, avx512 = 1U << 5 +#elif defined(TARGET_ARM) + sve = 1U << 0, +#endif }; class ONEDAL_EXPORT default_host_policy {}; diff --git a/cpp/oneapi/dal/partial_compute.hpp b/cpp/oneapi/dal/partial_compute.hpp index 820f74f9685..8ec259796f0 100644 --- a/cpp/oneapi/dal/partial_compute.hpp +++ b/cpp/oneapi/dal/partial_compute.hpp @@ -40,26 +40,4 @@ auto partial_compute(sycl::queue& queue, Args&&... args) { using v1::partial_compute; -namespace preview { - -template -auto partial_compute(spmd::communicator& comm, Args&&... args) { - return dal::detail::partial_compute_dispatch( - dal::detail::spmd_policy{ dal::detail::host_policy{}, comm }, - std::forward(args)...); -} - -#ifdef ONEDAL_DATA_PARALLEL -template -auto partial_compute(spmd::communicator& comm, Args&&... 
args) { - return dal::detail::partial_compute_dispatch( - dal::detail::spmd_policy{ - dal::detail::data_parallel_policy{ comm.get_queue() }, - comm }, - std::forward(args)...); -} -#endif - -} // namespace preview - } // namespace oneapi::dal diff --git a/cpp/oneapi/dal/table/test/csr_accessor.cpp b/cpp/oneapi/dal/table/test/csr_accessor.cpp index eec2e2bb879..40cea3b0a32 100644 --- a/cpp/oneapi/dal/table/test/csr_accessor.cpp +++ b/cpp/oneapi/dal/table/test/csr_accessor.cpp @@ -351,11 +351,10 @@ TEMPLATE_LIST_TEST_M(csr_accessor_test, test_alloc_kind::usm_device, test_alloc_kind::usm_shared); - this->accessor_alloc_ = GENERATE(test_alloc_kind::usm_device, test_alloc_kind::usm_shared); - - // Furter improvement: Add support of the following accessor allocation types: - // test_alloc_kind::host, - // test_alloc_kind::usm_host. + this->accessor_alloc_ = GENERATE(test_alloc_kind::host, + test_alloc_kind::usm_host, + test_alloc_kind::usm_device, + test_alloc_kind::usm_shared); #else this->table_alloc_ = test_alloc_kind::host; this->accessor_alloc_ = test_alloc_kind::host; @@ -379,11 +378,10 @@ TEMPLATE_LIST_TEST_M(csr_accessor_test, test_alloc_kind::usm_device, test_alloc_kind::usm_shared); - this->accessor_alloc_ = GENERATE(test_alloc_kind::usm_device, test_alloc_kind::usm_shared); - - // Furter improvement: Add support of the following accessor allocation types: - // test_alloc_kind::host, - // test_alloc_kind::usm_host. 
+ this->accessor_alloc_ = GENERATE(test_alloc_kind::host, + test_alloc_kind::usm_host, + test_alloc_kind::usm_device, + test_alloc_kind::usm_shared); #else this->table_alloc_ = test_alloc_kind::host; this->accessor_alloc_ = test_alloc_kind::host; diff --git a/cpp/oneapi/dal/test/engine/csr_table_builder.hpp b/cpp/oneapi/dal/test/engine/csr_table_builder.hpp index e8de4036bfe..2e4656f388c 100644 --- a/cpp/oneapi/dal/test/engine/csr_table_builder.hpp +++ b/cpp/oneapi/dal/test/engine/csr_table_builder.hpp @@ -19,6 +19,68 @@ namespace oneapi::dal::test::engine { +csr_table copy_data_to_csr(const dal::array& data, + const dal::array& column_indices, + const dal::array& row_offsets, + const sparse_indexing indexing, + const std::int64_t column_count, + const std::int64_t row_count) { + auto row_offs_ptr = row_offsets.get_data(); + auto data_ptr = data.get_data(); + auto col_indices_ptr = column_indices.get_data(); + auto nnz_count = row_offs_ptr[row_count] - row_offs_ptr[0]; + const auto copied_data = dal::array::empty(nnz_count); + const auto copied_col_indices = dal::array::empty(nnz_count); + const auto copied_row_offsets = dal::array::empty(row_count + 1); + + auto copied_data_ptr = copied_data.get_mutable_data(); + auto copied_col_indices_ptr = copied_col_indices.get_mutable_data(); + auto copied_row_offsets_ptr = copied_row_offsets.get_mutable_data(); + for (std::int32_t i = 0; i < nnz_count; ++i) { + copied_data_ptr[i] = data_ptr[i]; + copied_col_indices_ptr[i] = col_indices_ptr[i]; + } + for (std::int32_t i = 0; i <= row_count; ++i) { + copied_row_offsets_ptr[i] = row_offs_ptr[i]; + } + return csr_table::wrap(copied_data, + copied_col_indices, + copied_row_offsets, + column_count, + indexing); +} + +#ifdef ONEDAL_DATA_PARALLEL +csr_table copy_data_to_csr(sycl::queue& queue, + const dal::array& data, + const dal::array& column_indices, + const dal::array& row_offsets, + const sparse_indexing indexing, + const std::int64_t column_count, + const std::int64_t 
row_count) { + auto row_offs_ptr = row_offsets.get_data(); + auto nnz_count = row_offs_ptr[row_count] - row_offs_ptr[0]; + const auto copied_data = dal::array::empty(queue, nnz_count, sycl::usm::alloc::device); + const auto copied_col_indices = + dal::array::empty(queue, nnz_count, sycl::usm::alloc::device); + const auto copied_row_offsets = + dal::array::empty(queue, row_count + 1, sycl::usm::alloc::device); + auto data_event = queue.copy(data.get_data(), copied_data.get_mutable_data(), nnz_count); + auto col_indices_event = queue.copy(column_indices.get_data(), + copied_col_indices.get_mutable_data(), + nnz_count); + auto row_offsets_event = queue.copy(row_offsets.get_data(), + copied_row_offsets.get_mutable_data(), + row_count + 1); + sycl::event::wait_and_throw({ data_event, col_indices_event, row_offsets_event }); + return csr_table::wrap(copied_data, + copied_col_indices, + copied_row_offsets, + column_count, + indexing); +} +#endif // ONEDAL_DATA_PARALLEL + /** * Generates random CSR table based on inputs */ @@ -105,53 +167,23 @@ struct csr_table_builder { #ifdef ONEDAL_DATA_PARALLEL csr_table build_csr_table(device_test_policy& policy) const { auto queue = policy.get_queue(); - auto row_offs_ptr = row_offsets_.get_data(); - auto nnz_count = row_offs_ptr[row_count_] - row_offs_ptr[0]; - const auto copied_data = - dal::array::empty(queue, nnz_count, sycl::usm::alloc::device); - const auto copied_col_indices = - dal::array::empty(queue, nnz_count, sycl::usm::alloc::device); - const auto copied_row_offsets = - dal::array::empty(queue, row_count_ + 1, sycl::usm::alloc::device); - auto data_event = - queue.copy(data_.get_data(), copied_data.get_mutable_data(), nnz_count); - auto col_indices_event = queue.copy(column_indices_.get_data(), - copied_col_indices.get_mutable_data(), - nnz_count); - auto row_offsets_event = queue.copy(row_offsets_.get_data(), - copied_row_offsets.get_mutable_data(), - row_count_ + 1); - sycl::event::wait_and_throw({ data_event, 
col_indices_event, row_offsets_event }); - return csr_table::wrap(copied_data, - copied_col_indices, - copied_row_offsets, - column_count_, - indexing_); + return copy_data_to_csr(queue, + data_, + column_indices_, + row_offsets_, + indexing_, + column_count_, + row_count_); } #endif // ONEDAL_DATA_PARALLEL csr_table build_csr_table(host_test_policy& policy) const { - auto row_offs_ptr = row_offsets_.get_data(); - auto nnz_count = row_offs_ptr[row_count_] - row_offs_ptr[0]; - const auto copied_data = dal::array::empty(nnz_count); - const auto copied_col_indices = dal::array::empty(nnz_count); - const auto copied_row_offsets = dal::array::empty(row_count_ + 1); - - auto copied_data_ptr = copied_data.get_mutable_data(); - auto copied_col_indices_ptr = copied_col_indices.get_mutable_data(); - auto copied_row_offsets_ptr = copied_row_offsets.get_mutable_data(); - for (std::int32_t i = 0; i < nnz_count; ++i) { - copied_data_ptr[i] = data_.get_data()[i]; - copied_col_indices_ptr[i] = column_indices_.get_data()[i]; - } - for (std::int32_t i = 0; i <= row_count_; ++i) { - copied_row_offsets_ptr[i] = row_offs_ptr[i]; - } - return csr_table::wrap(copied_data, - copied_col_indices, - copied_row_offsets, - column_count_, - indexing_); + return copy_data_to_csr(data_, + column_indices_, + row_offsets_, + indexing_, + column_count_, + row_count_); } table build_dense_table() const { @@ -173,4 +205,176 @@ struct csr_table_builder { } }; +/// Generates CSR table with clustering dataset. +/// Dataset is looks like multidimensional blobs +/// with fixed centroid and randomized points around centroid +/// with radius :expr:`r=1.0`. 
+struct csr_make_blobs { + /// Floating type used for generation + using Float = float; + /// Indexing type used for generation + using Index = std::int64_t; + /// Dataset paramters + Index row_count_, column_count_, cluster_count_; + float nonzero_fraction_; + sparse_indexing indexing_; + const dal::array data_; + const dal::array column_indices_; + const dal::array row_offsets_; + /// Dataset generation parameters + const Float centroid_fill_value = 10.0f; + const Float min_val = -1.0f; + const Float max_val = 1.0f; + + csr_make_blobs(Index cluster_count, + Index row_count, + Index column_count, + float nnz_fraction = 0.05, + sparse_indexing indexing = sparse_indexing::one_based, + Index seed = 42) + : row_count_(row_count), + column_count_(column_count), + cluster_count_(cluster_count), + nonzero_fraction_(nnz_fraction), + indexing_(indexing), + data_(dal::array::empty(nnz_fraction * row_count * column_count)), + column_indices_(dal::array::empty(nnz_fraction * row_count * column_count)), + row_offsets_(dal::array::empty(row_count + 1)) { + // Get data arrays + auto data_ptr = data_.get_mutable_data(); + auto col_indices_ptr = column_indices_.get_mutable_data(); + auto row_offs_ptr = row_offsets_.get_mutable_data(); + const Index indexing_shift = bool(indexing == sparse_indexing::one_based); + // Estimate number of non-zero values in each row + const Index row_nonzero_count = column_count * nnz_fraction; + // Init random engines + std::mt19937 rng(seed); + std::uniform_real_distribution uniform_data(min_val, max_val); + std::uniform_int_distribution uniform_indices(indexing_shift, + column_count + indexing_shift - 1); + // Check if it is possible to generate non-empty row + if (row_nonzero_count < 1) { + std::cout << "ERROR: Non-zero fraction is too small to generate rows" << std::endl; + ONEDAL_ASSERT(row_nonzero_count >= 1); + return; + } + Index fill_count = 0; + row_offs_ptr[0] = indexing_shift; + // Create centroids + for (Index cent_idx = 0; cent_idx < 
cluster_count; ++cent_idx) { + std::set columns; + while (Index(columns.size()) < row_nonzero_count) { + const Index col_idx = uniform_indices(rng); + columns.insert(col_idx); + } + for (auto iter = columns.begin(); iter != columns.end(); iter++) { + data_ptr[fill_count] = centroid_fill_value * (cent_idx + 1); + col_indices_ptr[fill_count] = *iter; + fill_count++; + } + row_offs_ptr[cent_idx + 1] = fill_count + indexing_shift; + } + + // Generate remaining rows adding random noise to centroids + for (Index row_idx = cluster_count; row_idx < row_count; ++row_idx) { + const Index centroid_id = row_idx % cluster_count; + for (Index data_idx = row_offs_ptr[centroid_id] - indexing_shift; + data_idx < row_offs_ptr[centroid_id + 1] - indexing_shift; + ++data_idx) { + col_indices_ptr[fill_count] = col_indices_ptr[data_idx]; + data_ptr[fill_count] = data_ptr[data_idx] + uniform_data(rng); + fill_count++; + } + row_offs_ptr[row_idx + 1] = fill_count + indexing_shift; + } + } + + table get_data(host_test_policy& policy) const { + return copy_data_to_csr(data_, + column_indices_, + row_offsets_, + indexing_, + column_count_, + row_count_); + } + + table get_initial_centroids() const { + const auto result = dal::array::empty(cluster_count_ * column_count_); + auto result_ptr = result.get_mutable_data(); + + const Index shift = bool(indexing_ == sparse_indexing::one_based); + const auto data_ptr = data_.get_data(); + const auto col_ind_ptr = column_indices_.get_data(); + const auto row_offs_ptr = row_offsets_.get_data(); + for (Index row_idx = 0; row_idx < cluster_count_; ++row_idx) { + for (Index col_id = 0; col_id < column_count_; ++col_id) { + result_ptr[row_idx * column_count_ + col_id] = 0; + } + const auto start = row_offs_ptr[row_idx] - shift; + const auto end = row_offs_ptr[row_idx + 1] - shift; + for (Index data_idx = start; data_idx < end; ++data_idx) { + auto col_idx = col_ind_ptr[data_idx] - shift; + result_ptr[row_idx * column_count_ + col_idx] = data_ptr[data_idx]; 
+ } + } + return homogen_table::wrap(result, cluster_count_, column_count_); + } + + table get_result_centroids() const { + const auto result = dal::array::empty(cluster_count_ * column_count_); + auto result_ptr = result.get_mutable_data(); + const auto cluster_counts = dal::array::empty(cluster_count_); + auto counts_ptr = cluster_counts.get_mutable_data(); + + const Index shift = bool(indexing_ == sparse_indexing::one_based); + const auto data_ptr = data_.get_data(); + const auto col_ind_ptr = column_indices_.get_data(); + const auto row_offs_ptr = row_offsets_.get_data(); + for (Index row_idx = 0; row_idx < cluster_count_; ++row_idx) { + counts_ptr[row_idx] = 0; + for (Index col_id = 0; col_id < column_count_; ++col_id) { + result_ptr[row_idx * column_count_ + col_id] = 0; + } + } + for (Index row_idx = 0; row_idx < row_count_; ++row_idx) { + const auto start = row_offs_ptr[row_idx] - shift; + const auto end = row_offs_ptr[row_idx + 1] - shift; + for (Index data_idx = start; data_idx < end; ++data_idx) { + auto col_idx = col_ind_ptr[data_idx] - shift; + result_ptr[(row_idx % cluster_count_) * column_count_ + col_idx] += + data_ptr[data_idx]; + } + counts_ptr[row_idx % cluster_count_]++; + } + for (Index row_idx = 0; row_idx < cluster_count_; ++row_idx) { + for (Index col_id = 0; col_id < column_count_; ++col_id) { + result_ptr[row_idx * column_count_ + col_id] /= counts_ptr[row_idx]; + } + } + return homogen_table::wrap(result, cluster_count_, column_count_); + } + + table get_responses() const { + auto responses = dal::array::empty(row_count_); + auto response_ptr = responses.get_mutable_data(); + for (std::int32_t i = 0; i < row_count_; ++i) { + response_ptr[i] = i % cluster_count_; + } + return homogen_table::wrap(response_ptr, row_count_, 1); + } + +#ifdef ONEDAL_DATA_PARALLEL + table get_data(device_test_policy& policy) const { + auto queue = policy.get_queue(); + return copy_data_to_csr(queue, + data_, + column_indices_, + row_offsets_, + indexing_, + 
column_count_, + row_count_); + } +#endif // ONEDAL_DATA_PARALLEL +}; + } //namespace oneapi::dal::test::engine diff --git a/cpp/oneapi/dal/test/engine/fixtures.hpp b/cpp/oneapi/dal/test/engine/fixtures.hpp index 93e3363dbe8..8219c43a686 100644 --- a/cpp/oneapi/dal/test/engine/fixtures.hpp +++ b/cpp/oneapi/dal/test/engine/fixtures.hpp @@ -156,6 +156,16 @@ class crtp_base_algo_fixture : public float_algo_fixture(args)...); } + template + auto split_finalize_compute_input(Args&&... args) { + return derived().split_finalize_compute_input_override(std::forward(args)...); + } + + template + auto merge_finalize_compute_result(Args&&... args) { + return derived().merge_finalize_compute_result_override(std::forward(args)...); + } + template auto split_infer_input(Args&&... args) { return derived().split_infer_input_override(std::forward(args)...); @@ -221,6 +231,16 @@ class crtp_base_algo_fixture : public float_algo_fixture + auto split_finalize_compute_input_override(Args&&... args) { + ONEDAL_ASSERT(!"This method must be overriden in the derived class"); + } + + template + auto merge_finalize_compute_result_override(Args&&... args) { + ONEDAL_ASSERT(!"This method must be overriden in the derived class"); + } + template auto split_infer_input_override(Args&&... args) { ONEDAL_ASSERT(!"This method must be overriden in the derived class"); @@ -326,6 +346,49 @@ class crtp_algo_fixture : public crtp_base_algo_fixture { return this->merge_compute_result(results); } + template + auto finalize_compute_via_spmd_threads(std::int64_t thread_count, + const Descriptor& desc, + Args&&... 
args) { + ONEDAL_ASSERT(thread_count > 0); + + CAPTURE(thread_count); +#ifdef ONEDAL_DATA_PARALLEL + using comm_t = thread_communicator; + comm_t comm{ this->get_queue(), thread_count }; +#else + using comm_t = thread_communicator; + comm_t comm{ thread_count }; +#endif + + const auto input_per_rank = + this->split_finalize_compute_input(thread_count, std::forward(args)...); + ONEDAL_ASSERT(input_per_rank.size() == + dal::detail::integral_cast(thread_count)); + + const auto results = comm.map([&](std::int64_t rank) { + return dal::test::engine::spmd_finalize_compute(this->get_policy(), + comm, + desc, + input_per_rank[rank]); + }); + ONEDAL_ASSERT(results.size() == dal::detail::integral_cast(thread_count)); + + return results; + } + + template + auto finalize_compute_via_spmd_threads_and_merge(std::int64_t thread_count, + const Descriptor& desc, + Args&&... args) { + const auto results = this->finalize_compute_via_spmd_threads( // + thread_count, + desc, + std::forward(args)...); + + return this->merge_finalize_compute_result(results); + } + template auto infer_via_spmd_threads(std::int64_t thread_count, const Descriptor& desc, Args&&... args) { ONEDAL_ASSERT(thread_count > 0); diff --git a/cpp/oneapi/dal/test/engine/spmd.hpp b/cpp/oneapi/dal/test/engine/spmd.hpp index 7f795525704..a97837a72ed 100644 --- a/cpp/oneapi/dal/test/engine/spmd.hpp +++ b/cpp/oneapi/dal/test/engine/spmd.hpp @@ -79,4 +79,23 @@ inline auto spmd_compute(device_test_policy& policy, } #endif +template +inline auto spmd_finalize_compute(host_test_policy& policy, + const spmd::communicator& comm, + Args&&... args) { + return dal::finalize_compute(dal::detail::spmd_policy{ dal::detail::host_policy{}, comm }, + std::forward(args)...); +} + +#ifdef ONEDAL_DATA_PARALLEL +template +inline auto spmd_finalize_compute(device_test_policy& policy, + const spmd::communicator& comm, + Args&&... 
args) { + dal::detail::data_parallel_policy local_policy{ policy.get_queue() }; + dal::detail::spmd_policy spmd_policy{ local_policy, comm }; + return dal::finalize_compute(spmd_policy, std::forward(args)...); +} +#endif + } // namespace oneapi::dal::test::engine diff --git a/deploy/local/dal b/deploy/local/dal index 20ea287f505..6ddff7e537b 100644 --- a/deploy/local/dal +++ b/deploy/local/dal @@ -1,6 +1,7 @@ #%Module1.0################################################################### #=============================================================================== # Copyright 2020 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -60,7 +61,14 @@ proc ModulesHelp { } { # Set intermediate variables set dalroot "$componentroot" -set daal_target_arch "intel64" +set daalroot "$componentroot/$modulefilever" +if {[string equal [info machine] "aarch64"]} { + set daal_target_arch "arm" +} else { + set daal_target_arch "intel64" +} + +module-whatis "oneAPI Data Analytics Library for $daal_target_arch." # Setup environment variables setenv DAL_MAJOR_BINARY 1 diff --git a/deploy/local/vars_lnx.sh b/deploy/local/vars_lnx.sh index fc0172ff9bc..8cb606d13cf 100644 --- a/deploy/local/vars_lnx.sh +++ b/deploy/local/vars_lnx.sh @@ -4,6 +4,7 @@ #=============================================================================== # Copyright 2014 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -223,6 +224,17 @@ if [ ! 
-d $__daal_tmp_dir ]; then __daal_tmp_dir=${component_root} fi +ARCH_ONEDAL=$(uname -m) + +if [ "${ARCH_ONEDAL}" = "x86_64" ]; then + ARCH_DIR_ONEDAL="intel64" +elif [ "${ARCH_ONEDAL}" = "aarch64" ]; then + ARCH_DIR_ONEDAL="arm" +else + echo "Unsupported CPU architecture '${ARCH_ONEDAL}'" + exit 1 +fi + if [ "$(basename "${my_script_path}")" = "env" ] ; then # assume stand-alone # case "${my_script_path}" in # *"env"*) @@ -239,8 +251,8 @@ if [ "$(basename "${my_script_path}")" = "env" ] ; then # assume stand-alone export LD_LIBRARY_PATH="$__daal_tmp_dir/lib${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}" else export CPATH="$__daal_tmp_dir/include${CPATH+:${CPATH}}" - export LIBRARY_PATH="$__daal_tmp_dir/lib/intel64${LIBRARY_PATH+:${LIBRARY_PATH}}" - export LD_LIBRARY_PATH="$__daal_tmp_dir/lib/intel64${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}" + export LIBRARY_PATH="$__daal_tmp_dir/lib/$ARCH_DIR_ONEDAL${LIBRARY_PATH+:${LIBRARY_PATH}}" + export LD_LIBRARY_PATH="$__daal_tmp_dir/lib/$ARCH_DIR_ONEDAL${LD_LIBRARY_PATH+:${LD_LIBRARY_PATH}}" fi # ;; else # must be a consolidated layout diff --git a/deploy/nuget/prepare_dal_nuget.sh b/deploy/nuget/prepare_dal_nuget.sh index bf05f73978c..5d115c71e9b 100755 --- a/deploy/nuget/prepare_dal_nuget.sh +++ b/deploy/nuget/prepare_dal_nuget.sh @@ -1,6 +1,7 @@ #!/bin/bash #=============================================================================== # Copyright 2022 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -53,12 +54,20 @@ create_package() { # platform specific platform=$(bash $(dirname "$0")/../../dev/make/identify_os.sh) if [ ${platform} = "lnx32e" ]; then - platform=linux-x64 + platform=linux tbb_platform=linux rls_prefix=${rls_dir}/daal/latest dynamic_lib_path=lib/intel64 static_lib_path=lib/intel64 lib_prefix=libonedal + elif [ ${platform} = "lnxarm" ]; then + platform=linux + tbb_platform=linux + rls_prefix=${rls_dir}/daal/latest + dynamic_lib_path=lib/arm + static_lib_path=lib/arm + lib_prefix=libonedal + elif [ ${platform} = "mac32e" ]; then platform=osx-x64 tbb_platform=osx @@ -98,7 +107,7 @@ create_package() { if [ "${build_nupkg}" = "yes" ]; then # extension of libraries - if [ "${platform}" = "linux-x64" ]; then + if [ "${platform}" = "linux" ]; then dl_postfix=.so.${major_binary_version}.${minor_binary_version} sl_postfix=.a elif [ "${platform}" = "osx-x64" ]; then diff --git a/deploy/pkg-config/generate_pkgconfig.py b/deploy/pkg-config/generate_pkgconfig.py index b287f853bc4..323ff7e5138 100755 --- a/deploy/pkg-config/generate_pkgconfig.py +++ b/deploy/pkg-config/generate_pkgconfig.py @@ -1,5 +1,7 @@ +'''generate_pkgconfig.py''' #=============================================================================== # Copyright 2021 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +21,21 @@ import glob import argparse from sys import platform +import platform as plt + +def detect_cpu_architecture(): + """ + Detect CPU architecture + """ + architecture = plt.machine() + if architecture in ('x86_64', 'AMD64'): + return 'x86_64' + elif architecture.startswith('arm') or architecture == 'aarch64': + return 'aarch64' + else: + sys.stderr.write(f"Unknown Architecture {architecture} Detected. 
" \ + "Only 'x86_64', 'AMD64' and 'aarch64' supported.\n") + sys.exit(1) LIBS_PAR_STAT, LIBS_PAR_DYN = [], [] @@ -45,9 +62,18 @@ }, } +ARCH = detect_cpu_architecture() + if platform in ["linux2", "linux"]: PREF_LIB = "lib" - LIBDIR = 'lib/intel64' + + if ARCH == 'x86_64': + LIBDIR = 'lib/intel64' + elif ARCH == 'aarch64': + LIBDIR = 'lib/arm' + else: + sys.stderr.write(f"Unknown CPU architecture '{ARCH}'\n") + SUFF_DYN_LIB = ".so" SUFF_STAT_LIB = ".a" TBB_LIBS = "-ltbb -ltbbmalloc" diff --git a/deploy/pkg-config/pkg-config.tpl b/deploy/pkg-config/pkg-config.tpl index 1d59f3e5df3..53fd1276066 100755 --- a/deploy/pkg-config/pkg-config.tpl +++ b/deploy/pkg-config/pkg-config.tpl @@ -22,7 +22,7 @@ includedir=${{prefix}}/include #info Name: oneDAL Description: Intel(R) oneAPI Data Analytics Library -Version: 2024.2 +Version: 2024.3 URL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onedal.html #Link line Libs: {libs} diff --git a/dev/bazel/config/config.bzl b/dev/bazel/config/config.bzl index cfcfad074ab..8ff87b7e54b 100644 --- a/dev/bazel/config/config.bzl +++ b/dev/bazel/config/config.bzl @@ -210,7 +210,7 @@ def _declare_onedal_config_impl(repo_ctx): substitutions = { "%{auto_cpu}": auto_cpu, "%{version_major}": "2024", - "%{version_minor}": "2", + "%{version_minor}": "3", "%{version_update}": "0", "%{version_build}": utils.datestamp(repo_ctx), "%{version_buildrev}": "work", diff --git a/dev/bazel/config/cpudetect.cpp b/dev/bazel/config/cpudetect.cpp index a6abc42eb01..eee6a3d3752 100644 --- a/dev/bazel/config/cpudetect.cpp +++ b/dev/bazel/config/cpudetect.cpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2014 Intel Corporation +* Copyright contributors to the oneDAL project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +15,17 @@ * limitations under the License. 
*******************************************************************************/ +#if defined(__x86_64__) || defined(__x86_64) || defined(__amd64) || defined(_M_AMD64) + #define TARGET_X86_64 +#endif + +#if defined(__ARM_ARCH) || defined(__aarch64__) + #define TARGET_ARM +#endif + +#if defined(TARGET_X86_64) #include +#endif #if defined(_MSC_VER) #if (_MSC_FULL_VER >= 160040219) @@ -154,20 +165,25 @@ int check_sse42_features() { } std::string detect_cpu() { - try_enable_avx512f_on_macos(); - if (check_avx512_features()) { - return "avx512"; - } - else if (check_avx2_features()) { - return "avx2"; - } - else if (check_sse42_features()) { - return "sse42"; - } - else { - return "sse2"; - } + #if defined(TARGET_X86_64) + try_enable_avx512f_on_macos(); + + if (check_avx512_features()) { + return "avx512"; + } + else if (check_avx2_features()) { + return "avx2"; + } + else if (check_sse42_features()) { + return "sse42"; + } + else { + return "sse2"; + } + #elif defined(TARGET_ARM) + return "sve"; + #endif } int main(int argc, char const *argv[]) { diff --git a/dev/docker/onedal-dev.Dockerfile b/dev/docker/onedal-dev.Dockerfile index 1add2d27724..fa9bea8ed8d 100644 --- a/dev/docker/onedal-dev.Dockerfile +++ b/dev/docker/onedal-dev.Dockerfile @@ -14,7 +14,7 @@ # limitations under the License. 
#=============================================================================== -FROM ubuntu:22.04 +FROM ubuntu:22.04@sha256:77906da86b60585ce12215807090eb327e7386c8fafb5402369e421f44eff17e ARG workdirectory="/sources/oneDAL" diff --git a/dev/download_tbb.sh b/dev/download_tbb.sh index fe05c687795..5ba94225c95 100755 --- a/dev/download_tbb.sh +++ b/dev/download_tbb.sh @@ -1,6 +1,7 @@ #!/bin/bash #=============================================================================== # Copyright 2014 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/dev/make/common.mk b/dev/make/common.mk index 694d7a3bca1..291ca7e406b 100644 --- a/dev/make/common.mk +++ b/dev/make/common.mk @@ -78,6 +78,7 @@ secure.opts.link.mac = RC.COMPILE = rc.exe $(RCOPT) -fo$@ $< +# Used as $(eval $(call set_c_compile,$(COMPILER),$(_OS),$(gcc_toolchain)) C.COMPILE = $(if $(COMPILER.$(_OS).$(COMPILER)),$(COMPILER.$(_OS).$(COMPILER)),$(error COMPILER.$(_OS).$(COMPILER) must be defined)) \ $(if $(C.COMPILE.gcc_toolchain),--gcc-toolchain=$(C.COMPILE.gcc_toolchain)) \ -c $(secure.opts.icc.$(_OS)) $(COPT) $(INCLUDES) $1 $(-Fo)$@ $< diff --git a/dev/make/cmplr.clang.mkl.mk b/dev/make/compiler_definitions/clang.32e.mk similarity index 82% rename from dev/make/cmplr.clang.mkl.mk rename to dev/make/compiler_definitions/clang.32e.mk index 52d77cf757f..4f4844896a9 100644 --- a/dev/make/cmplr.clang.mkl.mk +++ b/dev/make/compiler_definitions/clang.32e.mk @@ -1,4 +1,4 @@ -# file: cmplt.clang.mk +# file: clang.32e.mk #=============================================================================== # Copyright 2012 Intel Corporation # @@ -16,17 +16,15 @@ #=============================================================================== #++ -# Clang defenitions for makefile +# Clang definitions for makefile. 
+# This file contains definitions common to clang on a 32e (intel64) platform. +# It should only be included from files which have more specializations (e.g. +# clang.mkl.32e.mk) #-- -PLATs.clang = lnx32e mac32e - -CMPLRDIRSUFF.clang = _clang - -CORE.SERV.COMPILER.clang = generic +include dev/make/compiler_definitions/clang.mk --Zl.clang = --DEBC.clang = -g +PLATs.clang = lnx32e mac32e COMPILER.mac.clang = clang++ -m64 -fgnu-runtime -stdlib=libc++ -mmacosx-version-min=10.15 -fwrapv \ -Werror -Wreturn-type @@ -36,11 +34,6 @@ COMPILER.lnx.clang = clang++ -m64 \ link.dynamic.mac.clang = clang++ -m64 link.dynamic.lnx.clang = clang++ -m64 -pedantic.opts.clang = -pedantic \ - -Wall \ - -Wextra \ - -Wno-unused-parameter - pedantic.opts.mac.clang = $(pedantic.opts.clang) pedantic.opts.lnx.clang = $(pedantic.opts.clang) diff --git a/dev/make/cmplr.clang.ref.mk b/dev/make/compiler_definitions/clang.mk similarity index 55% rename from dev/make/cmplr.clang.ref.mk rename to dev/make/compiler_definitions/clang.mk index b7d12348253..5c962ef511b 100644 --- a/dev/make/cmplr.clang.ref.mk +++ b/dev/make/compiler_definitions/clang.mk @@ -1,6 +1,6 @@ -# file: cmplt.clang.mk +# file: clang.mk #=============================================================================== -# Copyright 2023 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,11 +16,12 @@ #=============================================================================== #++ -# Clang defenitions for makefile +# Clang definitions for makefile +# This file contains definitions common to clang on all platforms. +# It should only be included from files which have more specializations (e.g. 
+# clang.32e.mk) #-- -PLATs.clang = lnx32e mac32e - CMPLRDIRSUFF.clang = _clang CORE.SERV.COMPILER.clang = generic @@ -28,23 +29,7 @@ CORE.SERV.COMPILER.clang = generic -Zl.clang = -DEBC.clang = -g -COMPILER.mac.clang = clang++ -m64 -fgnu-runtime -stdlib=libc++ -mmacosx-version-min=10.15 -fwrapv \ - -DDAAL_REF -DONEDAL_REF -Werror -Wreturn-type -COMPILER.lnx.clang = clang++ -m64 \ - -DDAAL_REF -DONEDAL_REF -Werror -Wreturn-type - -link.dynamic.mac.clang = clang++ -m64 -link.dynamic.lnx.clang = clang++ -m64 - pedantic.opts.clang = -pedantic \ -Wall \ -Wextra \ -Wno-unused-parameter - -pedantic.opts.mac.clang = $(pedantic.opts.clang) -pedantic.opts.lnx.clang = $(pedantic.opts.clang) - -p4_OPT.clang = $(-Q)march=nocona -mc3_OPT.clang = $(-Q)$(if $(OS_is_mac),march=nocona,march=nehalem) $(if $(OS_is_mac),$(-Q)mtune=nehalem) -avx2_OPT.clang = $(-Q)march=haswell -skx_OPT.clang = $(-Q)march=skx diff --git a/dev/make/compiler_definitions/clang.mkl.32e.mk b/dev/make/compiler_definitions/clang.mkl.32e.mk new file mode 100644 index 00000000000..9bac0e95622 --- /dev/null +++ b/dev/make/compiler_definitions/clang.mkl.32e.mk @@ -0,0 +1,22 @@ +# file: clang.mkl.32e.mk +#=============================================================================== +# Copyright 2012 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +#++ +# Clang definitions for makefile +#-- + +include dev/make/compiler_definitions/clang.32e.mk diff --git a/dev/make/compiler_definitions/clang.ref.32e.mk b/dev/make/compiler_definitions/clang.ref.32e.mk new file mode 100644 index 00000000000..291bc0295d5 --- /dev/null +++ b/dev/make/compiler_definitions/clang.ref.32e.mk @@ -0,0 +1,25 @@ +# file: clang.ref.32e.mk +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +#++ +# Clang definitions for makefile +#-- + +include dev/make/compiler_definitions/clang.32e.mk + +COMPILER.mac.clang = $(COMPILER.mac.clang) -DDAAL_REF -DONEDAL_REF +COMPILER.lnx.clang = $(COMPILER.lnx.clang) -DDAAL_REF -DONEDAL_REF diff --git a/dev/make/compiler_definitions/clang.ref.arm.mk b/dev/make/compiler_definitions/clang.ref.arm.mk new file mode 100644 index 00000000000..6b61a52c0dc --- /dev/null +++ b/dev/make/compiler_definitions/clang.ref.arm.mk @@ -0,0 +1,34 @@ +# file: clang.ref.arm.mk +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +#++ +# Clang definitions for makefile +#-- + +include dev/make/compiler_definitions/clang.mk + +PLATs.clang = lnxarm + +COMPILER.lnx.clang= clang++ -march=armv8-a+sve \ + -DDAAL_REF -DONEDAL_REF -DDAAL_CPU=sve -Werror -Wreturn-type +# Linker flags +link.dynamic.lnx.clang = clang++ -march=armv8-a+sve + +pedantic.opts.lnx.clang = $(pedantic.opts.clang) + +# For SVE +a8sve_OPT.clang = $(-Q)march=armv8-a+sve diff --git a/dev/make/cmplr.dpcpp.mk b/dev/make/compiler_definitions/dpcpp.mk similarity index 97% rename from dev/make/cmplr.dpcpp.mk rename to dev/make/compiler_definitions/dpcpp.mk index 2a78043b729..848f36c2db1 100644 --- a/dev/make/cmplr.dpcpp.mk +++ b/dev/make/compiler_definitions/dpcpp.mk @@ -16,7 +16,7 @@ #=============================================================================== #++ -# DPC++ Compiler defenitions for makefile +# DPC++ Compiler definitions for makefile #-- PLATs.dpcpp = lnx32e win32e diff --git a/dev/make/cmplr.gnu.mkl.mk b/dev/make/compiler_definitions/gnu.32e.mk similarity index 76% rename from dev/make/cmplr.gnu.mkl.mk rename to dev/make/compiler_definitions/gnu.32e.mk index 2e5008a519a..f90f0a95eed 100644 --- a/dev/make/cmplr.gnu.mkl.mk +++ b/dev/make/compiler_definitions/gnu.32e.mk @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2023 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not 
use this file except in compliance with the License. @@ -15,35 +15,26 @@ #=============================================================================== #++ -# g++ defenitions for makefile +# g++ definitions for makefile +# This file contains definitions common to gnu on a 32e (intel64) platform. It +# should only be included from files which have more specializations (e.g. +# gnu.mkl.32e.mk) #-- -PLATs.gnu = lnx32e mac32e - -CMPLRDIRSUFF.gnu = _gnu - -CORE.SERV.COMPILER.gnu = generic +include dev/make/compiler_definitions/gnu.mk --Zl.gnu = --DEBC.gnu = -g +PLATs.gnu = lnx32e mac32e COMPILER.all.gnu = ${CXX} -m64 -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \ -Werror -Wreturn-type link.dynamic.all.gnu = ${CXX} -m64 -pedantic.opts.all.gnu = -pedantic \ - -Wall \ - -Wextra \ - -Wno-unused-parameter - -COMPILER.lnx.gnu = $(COMPILER.all.gnu) -link.dynamic.lnx.gnu = $(link.dynamic.all.gnu) pedantic.opts.lnx.gnu = $(pedantic.opts.all.gnu) +pedantic.opts.mac.gnu = $(pedantic.opts.all.gnu) -COMPILER.mac.gnu = $(COMPILER.all.gnu) +link.dynamic.lnx.gnu = $(link.dynamic.all.gnu) link.dynamic.mac.gnu = $(link.dynamic.all.gnu) -pedantic.opts.mac.gnu = $(pedantic.opts.all.gnu) p4_OPT.gnu = $(-Q)march=nocona mc3_OPT.gnu = $(-Q)march=corei7 diff --git a/dev/make/cmplr.gnu.ref.mk b/dev/make/compiler_definitions/gnu.mk similarity index 57% rename from dev/make/cmplr.gnu.ref.mk rename to dev/make/compiler_definitions/gnu.mk index fac0235da8d..cb0679037f1 100644 --- a/dev/make/cmplr.gnu.ref.mk +++ b/dev/make/compiler_definitions/gnu.mk @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2023 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,11 +15,12 @@ #=============================================================================== #++ -# g++ defenitions for makefile +# g++ definitions for makefile +# This file contains definitions common to gnu on all platforms. It +# should only be included from files which have more specializations (e.g. +# gnu.32e.mk) #-- -PLATs.gnu = lnx32e mac32e - CMPLRDIRSUFF.gnu = _gnu CORE.SERV.COMPILER.gnu = generic @@ -27,25 +28,7 @@ CORE.SERV.COMPILER.gnu = generic -Zl.gnu = -DEBC.gnu = -g -COMPILER.all.gnu = ${CXX} -m64 -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \ - -DDAAL_REF -DONEDAL_REF -Werror -Wreturn-type - -link.dynamic.all.gnu = ${CXX} -m64 - pedantic.opts.all.gnu = -pedantic \ -Wall \ -Wextra \ -Wno-unused-parameter - -COMPILER.lnx.gnu = $(COMPILER.all.gnu) -link.dynamic.lnx.gnu = $(link.dynamic.all.gnu) -pedantic.opts.lnx.gnu = $(pedantic.opts.all.gnu) - -COMPILER.mac.gnu = $(COMPILER.all.gnu) -link.dynamic.mac.gnu = $(link.dynamic.all.gnu) -pedantic.opts.mac.gnu = $(pedantic.opts.all.gnu) - -p4_OPT.gnu = $(-Q)march=nocona -mc3_OPT.gnu = $(-Q)march=corei7 -avx2_OPT.gnu = $(-Q)march=haswell -skx_OPT.gnu = $(-Q)march=skylake diff --git a/dev/make/compiler_definitions/gnu.mkl.32e.mk b/dev/make/compiler_definitions/gnu.mkl.32e.mk new file mode 100644 index 00000000000..6877ee330dd --- /dev/null +++ b/dev/make/compiler_definitions/gnu.mkl.32e.mk @@ -0,0 +1,24 @@ +#=============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +#++ +# g++ definitions for makefile +#-- + +include dev/make/compiler_definitions/gnu.32e.mk + +COMPILER.lnx.gnu = $(COMPILER.all.gnu) +COMPILER.mac.gnu = $(COMPILER.all.gnu) diff --git a/dev/make/compiler_definitions/gnu.ref.32e.mk b/dev/make/compiler_definitions/gnu.ref.32e.mk new file mode 100644 index 00000000000..bd58dc8ab1d --- /dev/null +++ b/dev/make/compiler_definitions/gnu.ref.32e.mk @@ -0,0 +1,24 @@ +#=============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +#++ +# g++ definitions for makefile +#-- + +include dev/make/compiler_definitions/gnu.32e.mk + +COMPILER.lnx.gnu = $(COMPILER.all.gnu) -DDAAL_REF -DONEDAL_REF +COMPILER.mac.gnu = $(COMPILER.all.gnu) -DDAAL_REF -DONEDAL_REF diff --git a/dev/make/compiler_definitions/gnu.ref.arm.mk b/dev/make/compiler_definitions/gnu.ref.arm.mk new file mode 100644 index 00000000000..bf7379cc8bc --- /dev/null +++ b/dev/make/compiler_definitions/gnu.ref.arm.mk @@ -0,0 +1,34 @@ +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +#++ +# g++ definitions for makefile +#-- + +include dev/make/compiler_definitions/gnu.mk + +PLATs.gnu = lnxarm + +COMPILER.all.gnu = ${CXX} -march=armv8-a+sve -fwrapv -fno-strict-overflow -fno-delete-null-pointer-checks \ + -DDAAL_REF -DONEDAL_REF -DDAAL_CPU=sve -Werror -Wreturn-type + +link.dynamic.all.gnu = ${CXX} -march=native + +COMPILER.lnx.gnu = $(COMPILER.all.gnu) +link.dynamic.lnx.gnu = $(link.dynamic.all.gnu) +pedantic.opts.lnx.gnu = $(pedantic.opts.all.gnu) + +a8sve_OPT.gnu = $(-Q)march=armv8-a+sve diff --git a/dev/make/cmplr.icc.mkl.mk b/dev/make/compiler_definitions/icc.mkl.32e.mk similarity index 97% rename from dev/make/cmplr.icc.mkl.mk rename to dev/make/compiler_definitions/icc.mkl.32e.mk index a3ccb0750f3..a6ff2410ecc 100644 --- a/dev/make/cmplr.icc.mkl.mk +++ b/dev/make/compiler_definitions/icc.mkl.32e.mk @@ -15,7 +15,7 @@ #=============================================================================== #++ -# Intel compiler defenitions for makefile +# Intel compiler definitions for makefile #-- PLATs.icc = lnx32e win32e mac32e diff --git a/dev/make/cmplr.icx.mkl.mk b/dev/make/compiler_definitions/icx.mkl.32e.mk similarity index 96% rename from dev/make/cmplr.icx.mkl.mk rename to dev/make/compiler_definitions/icx.mkl.32e.mk index cbcde1a7e09..b22bcfe22ac 100644 --- a/dev/make/cmplr.icx.mkl.mk +++ b/dev/make/compiler_definitions/icx.mkl.32e.mk @@ -15,7 +15,7 @@ #=============================================================================== #++ -# Intel compiler defenitions for makefile +# Intel compiler definitions for makefile #-- PLATs.icx = lnx32e mac32e diff --git a/dev/make/cmplr.vc.mkl.mk b/dev/make/compiler_definitions/vc.mkl.32e.mk similarity index 100% rename from dev/make/cmplr.vc.mkl.mk rename to dev/make/compiler_definitions/vc.mkl.32e.mk diff --git a/dev/make/function_definitions/32e.mk b/dev/make/function_definitions/32e.mk new file mode 100644 index 
00000000000..41dfbb96fe9 --- /dev/null +++ b/dev/make/function_definitions/32e.mk @@ -0,0 +1,107 @@ +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +ifeq ($(filter mkl ref,$(BACKEND_CONFIG)),) + $(error Unsupported backend config '$(BACKEND_CONFIG)'. \ + Supported config for '$(PLAT)' are ['mkl', 'ref']) +endif + +COMPILERs = icc icx gnu clang vc +COMPILER ?= icc +CPUs := sse2 sse42 avx2 avx512 +CPUs.files := nrh neh hsw skx + +ONEAPI.dispatcher_tag.nrh := -D__CPU_TAG__=__CPU_TAG_SSE2__ +ONEAPI.dispatcher_tag.neh := -D__CPU_TAG__=__CPU_TAG_SSE42__ +ONEAPI.dispatcher_tag.hsw := -D__CPU_TAG__=__CPU_TAG_AVX2__ +ONEAPI.dispatcher_tag.skx := -D__CPU_TAG__=__CPU_TAG_AVX512__ + +# Used as $(eval $(call add_mandatory_cpu,var_name)) to add the mandatory CPU +# sse2 to the start of the list of CPUs stored in 'var_name' +define add_mandatory_cpu + $$(eval $1 := $$(if $$(filter sse2,$$($1)),$$($1),sse2 $$($1))) +endef + +# Used as $(eval $(call set_uarch_options_for_compiler,$(COMPILER))) +define set_uarch_options_for_compiler + $$(eval p4_OPT := $$(p4_OPT.$1)) + $$(eval mc3_OPT := $$(mc3_OPT.$1)) + $$(eval avx2_OPT := $$(avx2_OPT.$1)) + $$(eval skx_OPT := $$(skx_OPT.$1)) +endef + +# Used as $(eval $(call set_arch_file_suffix,var_name)) +define set_arch_file_suffix + 
$$(eval $1.files := $$(subst sse2,nrh,$$(subst sse42,neh,$$(subst avx2,hsw,$$(subst avx512,skx,$$($1)))))) +endef + +# Used as $(eval $(call set_usecpu_defs)) +# There are no parameters, as we assume we want to update the variable USECPUS, +# but we can't set this without a function call, as we rely on other variables +# already being set +define set_usecpu_defs + $$(eval USECPUS.out.defs := $$(subst sse2,^\#define DAAL_KERNEL_SSE2$$(sed.eow),\ + $$(subst sse42,^\#define DAAL_KERNEL_SSE42$$(sed.eow),\ + $$(subst avx2,^\#define DAAL_KERNEL_AVX2$$(sed.eow),\ + $$(subst avx512,^\#define DAAL_KERNEL_AVX512$$(sed.eow),$$(USECPUS.out)))))) +endef + +# Used as $(eval $(call append_uarch_copt,$(OBJNAME))) +define append_uarch_copt +$$(eval $$(call containing,_nrh, $1): COPT += $$(p4_OPT) -DDAAL_CPU=sse2) +$$(eval $$(call containing,_neh, $1): COPT += $$(mc3_OPT) -DDAAL_CPU=sse42) +$$(eval $$(call containing,_hsw, $1): COPT += $$(avx2_OPT) -DDAAL_CPU=avx2) +$$(eval $$(call containing,_skx, $1): COPT += $$(skx_OPT) -DDAAL_CPU=avx512) + +$$(eval $$(call containing,_flt, $1): COPT += -DDAAL_FPTYPE=float) +$$(eval $$(call containing,_dbl, $1): COPT += -DDAAL_FPTYPE=double) +endef + +# Used as $(eval $(call subst_arch_cpu_in_var,VARNAME)) +define subst_arch_cpu_in_var + $$(eval $1 := $$(subst _cpu_nrh,_cpu,$$($1))) + $$(eval $1 := $$(subst _cpu_neh,_cpu,$$($1))) + $$(eval $1 := $$(subst _cpu_hsw,_cpu,$$($1))) + $$(eval $1 := $$(subst _cpu_skx,_cpu,$$($1))) +endef + +# Use as $(eval $(call add_cpu_to_uarch_in_files,VAR_NAME +define add_cpu_to_uarch_in_files + $$(eval nrh_files := $$(subst _nrh,_cpu_nrh,$$(call containing,_nrh,$$($1)))) + $$(eval neh_files := $$(subst _neh,_cpu_neh,$$(call containing,_neh,$$($1)))) + $$(eval hsw_files := $$(subst _hsw,_cpu_hsw,$$(call containing,_hsw,$$($1)))) + $$(eval skx_files := $$(subst _skx,_cpu_skx,$$(call containing,_skx,$$($1)))) + $$(eval user_cpu_files := $$(nrh_files) $$(neh_files) $$(hsw_files) $$(skx_files)) +endef + +# Used as 
$(eval $(call dispatcher_cpu_rule,rule_name,$(USECPUS)))) +define dispatcher_cpu_rule +$1: | $(dir $1)/. + $(if $(filter sse42,$2),echo "#define ONEDAL_CPU_DISPATCH_SSE42" >> $$@) + $(if $(filter avx2,$2),echo "#define ONEDAL_CPU_DISPATCH_AVX2" >> $$@) + $(if $(filter avx512,$2),echo "#define ONEDAL_CPU_DISPATCH_AVX512" >> $$@) +endef + +# Used as $(eval $(call update_copt_from_dispatcher_tag,$(OBJ_NAME),suffix)) +# This must be called after the p4_OPT, mc3_OPT, avx2_OPT, skx_OPT, a8sve_OPT, +# and ONEAPI.dispatcher_tag.* variables are defined. Otherwise this will be a +# no-op +define update_copt_from_dispatcher_tag + $$(eval $(call containing,_nrh, $1): COPT += $$(p4_OPT$2) $$(ONEAPI.dispatcher_tag.nrh)) + $$(eval $(call containing,_neh, $1): COPT += $$(mc3_OPT$2) $$(ONEAPI.dispatcher_tag.neh)) + $$(eval $(call containing,_hsw, $1): COPT += $$(avx2_OPT$2) $$(ONEAPI.dispatcher_tag.hsw)) + $$(eval $(call containing,_skx, $1): COPT += $$(skx_OPT$2) $$(ONEAPI.dispatcher_tag.skx)) +endef diff --git a/dev/make/function_definitions/arm.mk b/dev/make/function_definitions/arm.mk new file mode 100644 index 00000000000..181b1c9ee2a --- /dev/null +++ b/dev/make/function_definitions/arm.mk @@ -0,0 +1,82 @@ +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +ifeq ($(filter ref,$(BACKEND_CONFIG)),) + $(error Unsupported backend config '$(BACKEND_CONFIG)'. \ + Supported config for '$(PLAT)' are ['ref']) +endif + +COMPILERs = gnu clang +COMPILER ?= gnu +CPUs := sve +CPUs.files := a8sve + +ONEAPI.dispatcher_tag.a8sve := -D__CPU_TAG__=__CPU_TAG_ARMV8SVE__ + +# Used as $(eval $(call add_mandatory_cpu,var_name)) to add the mandatory CPU +# sve to the start of the list of CPUs stored in 'var_name' +define add_mandatory_cpu + $$(eval $1 := $$(if $$(filter sve,$$($1)),$$($1),sve $$($1))) +endef + +# Used as $(eval $(call set_uarch_options_for_compiler,$(COMPILER))) +define set_uarch_options_for_compiler + $$(eval a8sve_OPT := $$(a8sve_OPT.$1)) +endef + +# Used as $(eval $(call set_arch_file_suffix,var_name)) +define set_arch_file_suffix + $$(eval $1.files := $$(subst sve,a8sve,$$($1))) +endef + +# Used as $(eval $(call set_usecpu_defs)) +# There are no parameters, as we assume we want to update the variable USECPUS, +# but we can't set this without a function call, as we rely on other variables +# already being set +define set_usecpu_defs + $$(eval USECPUS.out.defs := $$(subst sve,^\#define DAAL_KERNEL_SVE$$(sed.eow),$$(USECPUS.out))) +endef + +# Used as $(eval $(call append_uarch_copt,$(OBJNAME))) +define append_uarch_copt +$$(eval $$(call containing,_flt, $1): COPT += -DDAAL_FPTYPE=float) +$$(eval $$(call containing,_dbl, $1): COPT += -DDAAL_FPTYPE=double) +endef + +# Used as $(eval $(call subst_arch_cpu_in_var,VARNAME)) +define subst_arch_cpu_in_var + $$(eval $1 := $$(subst _cpu_a8sve,_cpu,$$($1))) +endef + +# Used as $(eval $(call add_cpu_to_uarch_in_files,VAR_NAME)) +define add_cpu_to_uarch_in_files + $$(eval a8sve_files := $$(subst _a8sve,_cpu_a8sve,$$(call containing,_a8sve,$$($1)))) + $$(eval user_cpu_files := $$(a8sve_files)) +endef + +# Used as $(eval $(call dispatcher_cpu_rule,rule_name,$(USECPUS))) +define dispatcher_cpu_rule +$1: |
$(dir $1)/. + $(if $(filter sve,$2),echo "#define ONEDAL_CPU_DISPATCH_A8SVE" >> $$@) +endef + +# Used as $(eval $(call update_copt_from_dispatcher_tag,$(OBJ_NAME),suffix)) +# This must be called after the p4_OPT, mc3_OPT, avx2_OPT, skx_OPT, a8sve_OPT, +# and ONEAPI.dispatcher_tag.* variables are defined. Otherwise this will be a +# no-op +define update_copt_from_dispatcher_tag + $$(eval $(call containing,_a8sve, $1): COPT += $$(a8sve_OPT$2) $$(ONEAPI.dispatcher_tag.a8sve)) +endef diff --git a/dev/make/function_definitions/lnx32e.mk b/dev/make/function_definitions/lnx32e.mk new file mode 100644 index 00000000000..ea5e759520a --- /dev/null +++ b/dev/make/function_definitions/lnx32e.mk @@ -0,0 +1,37 @@ +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +BACKEND_CONFIG ?= mkl +ARCH = 32e +ARCH_DIR_ONEDAL = intel64 +_OS := lnx +_IA := intel64 + +include dev/make/function_definitions/32e.mk + +# Used as $(eval $(call set_daal_rt_deps)) +define set_daal_rt_deps + $$(eval daaldep.lnx32e.rt.thr := -L$$(TBBDIR.soia.lnx) -ltbb -ltbbmalloc \ + -lpthread $$(daaldep.lnx32e.rt.$$(COMPILER)) \ + $$(if $$(COV.libia),$$(COV.libia)/libcov.a)) + $$(eval daaldep.lnx32e.rt.seq := -lpthread $$(daaldep.lnx32e.rt.$$(COMPILER)) \ + $$(if $$(COV.libia),$$(COV.libia)/libcov.a)) + $$(eval daaldep.lnx32e.rt.dpc := -lpthread -lOpenCL \ + $$(if $$(COV.libia),$$(COV.libia)/libcov.a)) + $$(eval daaldep.lnx32e.threxport := export_lnx32e.$$(BACKEND_CONFIG).def) + + $$(eval daaldep.lnx.threxport.create = grep -v -E '^(EXPORTS|;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter) | sed -e 's/^/-u /') +endef diff --git a/dev/make/function_definitions/lnxarm.mk b/dev/make/function_definitions/lnxarm.mk new file mode 100644 index 00000000000..c44df217e03 --- /dev/null +++ b/dev/make/function_definitions/lnxarm.mk @@ -0,0 +1,37 @@ +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +BACKEND_CONFIG ?= ref +ARCH = arm +ARCH_DIR_ONEDAL = arm +_OS := lnx +_IA := arm + +include dev/make/function_definitions/arm.mk + +# Used as $(eval $(call set_daal_rt_deps)) +define set_daal_rt_deps + $$(eval daaldep.lnxarm.rt.thr := -L$$(TBBDIR.soia.lnx) -ltbb -ltbbmalloc \ + -lpthread $$(daaldep.lnxarm.rt.$$(COMPILER)) \ + $$(if $$(COV.libia),$$(COV.libia)/libcov.a)) + $$(eval daaldep.lnxarm.rt.seq := -lpthread $$(daaldep.lnxarm.rt.$$(COMPILER)) \ + $$(if $$(COV.libia),$$(COV.libia)/libcov.a)) + $$(eval daaldep.lnxarm.rt.dpc := -lpthread -lOpenCL \ + $$(if $$(COV.libia),$$(COV.libia)/libcov.a)) + $$(eval daaldep.lnxarm.threxport := export_lnxarm.$$(BACKEND_CONFIG).def) + + $$(eval daaldep.lnx.threxport.create = grep -v -E '^(EXPORTS|;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter) | sed -e 's/^/-u /') +endef diff --git a/dev/make/function_definitions/mac32e.mk b/dev/make/function_definitions/mac32e.mk new file mode 100644 index 00000000000..a86b2416838 --- /dev/null +++ b/dev/make/function_definitions/mac32e.mk @@ -0,0 +1,33 @@ +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +BACKEND_CONFIG ?= mkl +ARCH = 32e +ARCH_DIR_ONEDAL = intel64 +_OS := mac +_IA := intel64 + +include dev/make/function_definitions/32e.mk + +# Used as $(eval $(call set_daal_rt_deps)) +define set_daal_rt_deps + $$(eval daaldep.mac32e.rt.thr := -L$$(RELEASEDIR.tbb.soia) -ltbb -ltbbmalloc \ + $$(daaldep.mac32e.rt.$$(COMPILER))) + $$(eval daaldep.mac32e.rt.seq := $$(daaldep.mac32e.rt.$$(COMPILER))) + $$(eval daaldep.mac32e.threxport := export_mac.def) + + $$(eval daaldep.mac.threxport.create = grep -v -E '^(EXPORTS|;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter) | sed -e 's/^/-u /') +endef diff --git a/dev/make/function_definitions/win32e.mk b/dev/make/function_definitions/win32e.mk new file mode 100644 index 00000000000..c37480ef549 --- /dev/null +++ b/dev/make/function_definitions/win32e.mk @@ -0,0 +1,34 @@ +#=============================================================================== +# Copyright contributors to the oneDAL project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +BACKEND_CONFIG ?= mkl +ARCH = 32e +ARCH_DIR_ONEDAL = intel64 +_OS := win +_IA := intel64 + +include dev/make/function_definitions/32e.mk + +# Used as $(eval $(call set_daal_rt_deps)) +define set_daal_rt_deps + $$(eval daaldep.win32e.rt.thr := -LIBPATH:$$(RELEASEDIR.tbb.libia) \ + $$(dep_thr) $$(if $$(CHECK_DLL_SIG),Wintrust.lib)) + $$(eval daaldep.win32e.rt.seq := $$(dep_seq) \ + $$(if $$(CHECK_DLL_SIG),Wintrust.lib)) + $$(eval daaldep.win32e.threxport := export.def) + + $$(eval daaldep.win.threxport.create = grep -v -E '^(;|$$$$$$$$)' $$$$< $$$$(USECPUS.out.grep.filter)) +endef diff --git a/dev/make/identify_os.sh b/dev/make/identify_os.sh index 12816582bbc..d8d70054d9f 100755 --- a/dev/make/identify_os.sh +++ b/dev/make/identify_os.sh @@ -16,12 +16,20 @@ #=============================================================================== os=$(uname) +ARCH=$(uname -m) if [ "${os}" = "Linux" ]; then - echo lnx32e + if [ "${ARCH}" = "x86_64" ]; then + echo lnx32e + elif [ "${ARCH}" = "aarch64" ]; then + echo lnxarm + else + echo "Unknown architecture: ${ARCH}" + exit 1 + fi elif [ "${os}" = "Darwin" ]; then echo mac32e elif [[ "${os}" =~ "MSYS" || "${os}" =~ "CYGWIN" ]]; then echo win32e else - echo "UnknownOS" + echo "Unknown OS: ${os}" fi diff --git a/docs/doxygen/doxygen_conf_cpp.txt b/docs/doxygen/doxygen_conf_cpp.txt index f28ca2e879f..26cd6fb69b2 100644 --- a/docs/doxygen/doxygen_conf_cpp.txt +++ b/docs/doxygen/doxygen_conf_cpp.txt @@ -38,7 +38,7 @@ PROJECT_NAME = "C++ API Reference for Intel(R) oneAPI Data Analytics L
-PROJECT_NUMBER = "2024.2" +PROJECT_NUMBER = "2024.3" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/doxygen/doxygen_conf_cpp_examples.txt b/docs/doxygen/doxygen_conf_cpp_examples.txt index 2bb1aa9bfce..f1927e0a04e 100644 --- a/docs/doxygen/doxygen_conf_cpp_examples.txt +++ b/docs/doxygen/doxygen_conf_cpp_examples.txt @@ -38,7 +38,7 @@ PROJECT_NAME = "C++ API Reference for Intel(R) oneAPI Data Analytics L # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "2024.2" +PROJECT_NUMBER = "2024.3" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/doxygen/doxygen_conf_cpp_web.txt b/docs/doxygen/doxygen_conf_cpp_web.txt index c71f084e687..d47f4402e11 100644 --- a/docs/doxygen/doxygen_conf_cpp_web.txt +++ b/docs/doxygen/doxygen_conf_cpp_web.txt @@ -38,7 +38,7 @@ PROJECT_NAME = "C++ API Reference for Intel(R) oneAPI Data Analytics L # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = "2024.2" +PROJECT_NUMBER = "2024.3" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/requirements.txt b/docs/requirements.txt index 381ee0deae3..63f249ddea7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ alabaster==0.7.13 Babel==2.13.1 beautifulsoup4==4.12.2 -certifi==2023.7.22 +certifi==2024.2.2 chardet==5.2.0 click==8.1.7 colorama==0.4.6 @@ -13,7 +13,7 @@ importlib-resources==6.1.1 Jinja2==3.1.3 lxml==5.1.0 MarkupSafe==2.1.3 -packaging==23.2 +packaging==24.0 pydata-sphinx-theme==0.14.3 Pygments==2.16.1 pyparsing==3.1.1 diff --git a/examples/cmake/setup_examples.cmake b/examples/cmake/setup_examples.cmake index 78e37159f17..a705b7e10f6 100644 --- a/examples/cmake/setup_examples.cmake +++ b/examples/cmake/setup_examples.cmake @@ -1,5 +1,6 @@ #=============================================================================== # Copyright 2023 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -101,6 +102,16 @@ endfunction() function (add_examples examples_paths) foreach(example_file_path ${examples_paths}) get_filename_component(example ${example_file_path} NAME_WE) + + # Detect CPU architecture + if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") + set(CPU_ARCHITECTURE "intel_intel64") + elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "aarch64") + set(CPU_ARCHITECTURE "arm_aarch64") + else() + message(FATAL_ERROR "Unknown architecture ${CMAKE_HOST_SYSTEM_PROCESSOR}") + endif() + add_executable(${example} ${example_file_path}) target_include_directories(${example} PRIVATE ${oneDAL_INCLUDE_DIRS}) if (UNIX AND NOT APPLE) @@ -110,7 +121,7 @@ function (add_examples examples_paths) endif() target_compile_options(${example} PRIVATE ${ONEDAL_CUSTOM_COMPILE_OPTIONS}) target_link_options(${example} PRIVATE ${ONEDAL_CUSTOM_LINK_OPTIONS}) - set_target_properties(${example} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/intel_intel64_${LINK_TYPE}") + set_target_properties(${example} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/_cmake_results/${CPU_ARCHITECTURE}_${LINK_TYPE}") endforeach() set_common_compiler_options() endfunction() diff --git a/makefile b/makefile index f4ecdffa268..18211bbe973 100644 --- a/makefile +++ b/makefile @@ -1,5 +1,6 @@ #=============================================================================== # Copyright 2014 Intel Corporation +# Copyright contributors to the oneDAL project # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -18,21 +19,25 @@ # Common macros #=============================================================================== -ifeq ($(PLAT),) - PLAT:=$(shell bash dev/make/identify_os.sh) -endif - ifeq (help,$(MAKECMDGOALS)) PLAT:=win32e +else ifeq ($(PLAT),) + PLAT:=$(shell bash dev/make/identify_os.sh) endif -attr.lnx32e = lnx intel64 lin -attr.mac32e = mac intel64 -attr.win32e = win intel64 win +# Check that we know how to build for the identified platform +PLATs := lnx32e mac32e win32e lnxarm +$(if $(filter $(PLAT),$(PLATs)),,$(error Unknown platform $(PLAT))) -_OS := $(word 1,$(attr.$(PLAT))) -_IA := $(word 2,$(attr.$(PLAT))) -_OSc:= $(word 3,$(attr.$(PLAT))) +# Non-platform or architecture specific defines live in common.mk +include dev/make/common.mk + +# Platform specific variables are set in dev/make/function_definitions/$(PLAT).mk +# There are also files dev/make/function_definitions/$(ARCH).mk, but these are included from +# the $(PLAT).mk files, rather than here. +include dev/make/function_definitions/$(PLAT).mk + +$(if $(filter $(COMPILERs),$(COMPILER)),,$(error COMPILER must be one of $(COMPILERs))) MSVC_RUNTIME_VERSIONs = release debug MSVC_RUNTIME_VERSION ?= release @@ -43,6 +48,7 @@ OS_is_$(_OS) := yes IA_is_$(_IA) := yes PLAT_is_$(PLAT) := yes MSVC_RT_is_$(MSVC_RUNTIME_VERSION) := yes +ARCH_is_$(ARCH) := yes DEFAULT_BUILD_PARAMETERS_LIB := $(if $(OS_is_win),no,yes) BUILD_PARAMETERS_LIB ?= $(DEFAULT_BUILD_PARAMETERS_LIB) @@ -53,19 +59,10 @@ $(error Building with the parameters library is not available on Windows OS) endif endif -COMPILERs = icc icx gnu clang vc -COMPILER ?= icc - -BACKEND_CONFIG ?= mkl - -$(if $(filter $(COMPILERs),$(COMPILER)),,$(error COMPILER must be one of $(COMPILERs))) - -CPUs := sse2 sse42 avx2 avx512 -CPUs.files := nrh neh hsw skx - USERREQCPU := $(filter-out $(filter $(CPUs),$(REQCPU)),$(REQCPU)) USECPUS := $(if $(REQCPU),$(if $(USERREQCPU),$(error Unsupported value/s in REQCPU: $(USERREQCPU). 
List of supported CPUs: $(CPUs)),$(REQCPU)),$(CPUs)) -USECPUS := $(if $(filter sse2,$(USECPUS)),$(USECPUS),sse2 $(USECPUS)) + +$(eval $(call add_mandatory_cpu,USECPUS)) $(info Selected list of CPUs - USECPUS: $(USECPUS)) @@ -91,8 +88,8 @@ endif DPC.COMPILE.gcc_toolchain := $(GCC_TOOLCHAIN_PATH) endif -include dev/make/cmplr.$(COMPILER).$(BACKEND_CONFIG).mk -include dev/make/cmplr.dpcpp.mk +include dev/make/compiler_definitions/$(COMPILER).$(BACKEND_CONFIG).$(ARCH).mk +include dev/make/compiler_definitions/dpcpp.mk $(if $(filter $(PLATs.$(COMPILER)),$(PLAT)),,$(error PLAT for $(COMPILER) must be defined to one of $(PLATs.$(COMPILER)))) @@ -100,7 +97,6 @@ $(if $(filter $(PLATs.$(COMPILER)),$(PLAT)),,$(error PLAT for $(COMPILER) must b # Dependencies generation #=============================================================================== -include dev/make/common.mk include dev/make/deps.mk #=============================================================================== @@ -133,21 +129,16 @@ y := $(notdir $(filter $(_OS)/%,lnx/so win/dll mac/dylib)) -eGRP = $(if $(OS_is_lnx),-Wl$(comma)--end-group,) daalmake = make -p4_OPT := $(p4_OPT.$(COMPILER)) -mc3_OPT := $(mc3_OPT.$(COMPILER)) -avx2_OPT := $(avx2_OPT.$(COMPILER)) -skx_OPT := $(skx_OPT.$(COMPILER)) +$(eval $(call set_uarch_options_for_compiler,$(COMPILER))) -_OSr := $(if $(OS_is_win),win,$(if $(OS_is_lnx),lin,)) +$(eval $(call set_arch_file_suffix,USECPUS)) -USECPUS.files := $(subst sse2,nrh,$(subst sse42,neh,$(subst avx2,hsw,$(subst avx512,skx,$(USECPUS))))) USECPUS.out := $(filter-out $(USECPUS),$(CPUs)) USECPUS.out.for.grep.filter := $(addprefix _,$(addsuffix _,$(subst $(space),_|_,$(USECPUS.out)))) USECPUS.out.grep.filter := $(if $(USECPUS.out),| grep -v -E '$(USECPUS.out.for.grep.filter)') -USECPUS.out.defs := $(subst sse2,^\#define DAAL_KERNEL_SSE2$(sed.eow),\ - $(subst sse42,^\#define DAAL_KERNEL_SSE42$(sed.eow),\ - $(subst avx2,^\#define DAAL_KERNEL_AVX2$(sed.eow),\ - $(subst avx512,^\#define 
DAAL_KERNEL_AVX512$(sed.eow),$(USECPUS.out))))) + +$(eval $(call set_usecpu_defs)) + USECPUS.out.defs := $(subst $(space)^,|^,$(strip $(USECPUS.out.defs))) USECPUS.out.defs.filter := $(if $(USECPUS.out.defs),sed $(sed.-b) $(sed.-i) -E -e 's/$(USECPUS.out.defs)/$(sed.eol)/') @@ -291,7 +282,6 @@ mklgpufpk.HEADERS := $(MKLGPUFPKDIR.include)/mkl_dal_sycl.hpp $(MKLGPUFPKDIR.inc include dev/make/deps.$(BACKEND_CONFIG).mk - #============================= oneAPI folders ===================================== ifeq ($(if $(or $(OS_is_lnx),$(OS_is_win)),yes,),yes) ONEAPIDIR := $(call topf,$$ONEAPI_ROOT) @@ -349,25 +339,8 @@ release.PARAMETERS.LIBS_A.dpc := $(parameters_a.dpc) \ $(if $(OS_is_win),$(foreach ilib,$(parameters_a.dpc),$(ilib:%.lib=%_dll.lib)),) release.PARAMETERS.LIBS_Y.dpc := $(parameters_y.dpc) -# Libraries required for building -daaldep.lnx32e.rt.thr := -L$(TBBDIR.soia.lnx) -ltbb -ltbbmalloc -lpthread $(daaldep.lnx32e.rt.$(COMPILER)) $(if $(COV.libia),$(COV.libia)/libcov.a) -daaldep.lnx32e.rt.seq := -lpthread $(daaldep.lnx32e.rt.$(COMPILER)) $(if $(COV.libia),$(COV.libia)/libcov.a) -daaldep.lnx32e.rt.dpc := -lpthread -lOpenCL $(if $(COV.libia),$(COV.libia)/libcov.a) -daaldep.lnx32e.threxport := export_lnx32e.$(BACKEND_CONFIG).def -daaldep.lnx.threxport.create = grep -v -E '^(EXPORTS|;|$$)' $< $(USECPUS.out.grep.filter) | sed -e 's/^/-u /' - -daaldep.win32e.rt.thr := -LIBPATH:$(RELEASEDIR.tbb.libia) $(dep_thr) $(if $(CHECK_DLL_SIG),Wintrust.lib) -daaldep.win32e.rt.seq := $(dep_seq) $(if $(CHECK_DLL_SIG),Wintrust.lib) -daaldep.win32e.threxport := export.def - -daaldep.win.threxport.create = grep -v -E '^(;|$$)' $< $(USECPUS.out.grep.filter) - -daaldep.mac32e.rt.thr := -L$(RELEASEDIR.tbb.soia) -ltbb -ltbbmalloc $(daaldep.mac32e.rt.$(COMPILER)) -daaldep.mac32e.rt.seq := $(daaldep.mac32e.rt.$(COMPILER)) -daaldep.mac32e.threxport := export_mac.def - -daaldep.mac.threxport.create = grep -v -E '^(EXPORTS|;|$$)' $< $(USECPUS.out.grep.filter) | sed -e 's/^/-u /' +$(eval 
$(call set_daal_rt_deps)) daaldep.rt.thr := $(daaldep.$(PLAT).rt.thr) daaldep.rt.seq := $(daaldep.$(PLAT).rt.seq) @@ -514,12 +487,8 @@ $(CORE.objs_a): COPT += -D__TBB_NO_IMPLICIT_LINKAGE -DDAAL_NOTHROW_EXCEPTIONS \ $(if $(CHECK_DLL_SIG),-DDAAL_CHECK_DLL_SIG) $(CORE.objs_a): COPT += @$(CORE.tmpdir_a)/inc_a_folders.txt $(filter %threading.$o, $(CORE.objs_a)): COPT += -D__DO_TBB_LAYER__ -$(call containing,_nrh, $(CORE.objs_a)): COPT += $(p4_OPT) -DDAAL_CPU=sse2 -$(call containing,_neh, $(CORE.objs_a)): COPT += $(mc3_OPT) -DDAAL_CPU=sse42 -$(call containing,_hsw, $(CORE.objs_a)): COPT += $(avx2_OPT) -DDAAL_CPU=avx2 -$(call containing,_skx, $(CORE.objs_a)): COPT += $(skx_OPT) -DDAAL_CPU=avx512 -$(call containing,_flt, $(CORE.objs_a)): COPT += -DDAAL_FPTYPE=float -$(call containing,_dbl, $(CORE.objs_a)): COPT += -DDAAL_FPTYPE=double + +$(eval $(call append_uarch_copt,$(CORE.objs_a))) $(CORE.objs_y): $(CORE.tmpdir_y)/inc_y_folders.txt $(CORE.objs_y): COPT += $(-fPIC) $(-cxx11) $(-Zl) $(-DEBC) @@ -529,12 +498,8 @@ $(CORE.objs_y): COPT += -D__DAAL_IMPLEMENTATION \ $(if $(CHECK_DLL_SIG),-DDAAL_CHECK_DLL_SIG) $(CORE.objs_y): COPT += @$(CORE.tmpdir_y)/inc_y_folders.txt $(filter %threading.$o, $(CORE.objs_y)): COPT += -D__DO_TBB_LAYER__ -$(call containing,_nrh, $(CORE.objs_y)): COPT += $(p4_OPT) -DDAAL_CPU=sse2 -$(call containing,_neh, $(CORE.objs_y)): COPT += $(mc3_OPT) -DDAAL_CPU=sse42 -$(call containing,_hsw, $(CORE.objs_y)): COPT += $(avx2_OPT) -DDAAL_CPU=avx2 -$(call containing,_skx, $(CORE.objs_y)): COPT += $(skx_OPT) -DDAAL_CPU=avx512 -$(call containing,_flt, $(CORE.objs_y)): COPT += -DDAAL_FPTYPE=float -$(call containing,_dbl, $(CORE.objs_y)): COPT += -DDAAL_FPTYPE=double + +$(eval $(call append_uarch_copt,$(CORE.objs_y))) vpath vpath %.cpp $(CORE.srcdirs) @@ -546,19 +511,19 @@ $(CORE.tmpdir_y)/inc_y_folders.txt: makefile.lst | $(CORE.tmpdir_y)/. 
$(CORE.inc $(CORE.tmpdir_a)/library_version_info.$(o): $(VERSION_DATA_FILE) $(CORE.tmpdir_y)/library_version_info.$(o): $(VERSION_DATA_FILE) +# Used as $(eval $(call .compile.template.ay,obj_file)) define .compile.template.ay $(eval template_source_cpp := $(subst .$o,.cpp,$(notdir $1))) $(eval template_source_cpp := $(subst _fpt_flt,_fpt,$(template_source_cpp))) $(eval template_source_cpp := $(subst _fpt_dbl,_fpt,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_nrh,_cpu,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_neh,_cpu,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_hsw,_cpu,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_skx,_cpu,$(template_source_cpp))) + +$(eval $(call subst_arch_cpu_in_var,template_source_cpp)) + $1: $(template_source_cpp) ; $(value C.COMPILE) endef -$(foreach a,$(CORE.objs_a),$(eval $(call .compile.template.ay,$a,$(CORE.tmpdir_a)))) -$(foreach a,$(CORE.objs_y),$(eval $(call .compile.template.ay,$a,$(CORE.tmpdir_y)))) +$(foreach a,$(CORE.objs_a),$(eval $(call .compile.template.ay,$a))) +$(foreach a,$(CORE.objs_y),$(eval $(call .compile.template.ay,$a))) $(CORE.tmpdir_y)/dll.res: $(VERSION_DATA_FILE) $(CORE.tmpdir_y)/dll.res: RCOPT += $(addprefix -I, $(CORE.incdirs.common)) @@ -582,10 +547,6 @@ ONEAPI.incdirs.thirdp := $(CORE.incdirs.common) $(daaldep.math_backend.incdir) $ ONEAPI.incdirs := $(ONEAPI.incdirs.common) $(CORE.incdirs.thirdp) $(ONEAPI.incdirs.thirdp) ONEAPI.dispatcher_cpu = $(WORKDIR)/oneapi/dal/_dal_cpu_dispatcher_gen.hpp -ONEAPI.dispatcher_tag.nrh := -D__CPU_TAG__=__CPU_TAG_SSE2__ -ONEAPI.dispatcher_tag.neh := -D__CPU_TAG__=__CPU_TAG_SSE42__ -ONEAPI.dispatcher_tag.hsw := -D__CPU_TAG__=__CPU_TAG_AVX2__ -ONEAPI.dispatcher_tag.skx := -D__CPU_TAG__=__CPU_TAG_AVX512__ ONEAPI.srcdir := $(CPPDIR.onedal) ONEAPI.srcdirs.base := $(ONEAPI.srcdir) \ @@ -628,11 +589,9 @@ ONEAPI.objs_y.all := $(ONEAPI.objs_y) $(ONEAPI.objs_y.dpc) define .populate_cpus 
$(eval non_cpu_files := $(call notcontaining,_cpu,$2)) $(eval cpu_files := $(call containing,_cpu,$2)) -$(eval nrh_files := $(subst _nrh,_cpu_nrh,$(call containing,_nrh,$(non_cpu_files)))) -$(eval neh_files := $(subst _neh,_cpu_neh,$(call containing,_neh,$(non_cpu_files)))) -$(eval hsw_files := $(subst _hsw,_cpu_hsw,$(call containing,_hsw,$(non_cpu_files)))) -$(eval skx_files := $(subst _skx,_cpu_skx,$(call containing,_skx,$(non_cpu_files)))) -$(eval user_cpu_files := $(nrh_files) $(neh_files) $(hsw_files) $(skx_files)) + +$(eval $(call add_cpu_to_uarch_in_files,non_cpu_files)) + $(eval populated_cpu_files := $(foreach ccc,$(USECPUS.files),$(subst _cpu,_cpu_$(ccc),$(cpu_files)))) $(eval populated_cpu_files := $(filter-out $(user_cpu_files),$(populated_cpu_files))) $(eval $1 := $(non_cpu_files) $(populated_cpu_files)) @@ -655,10 +614,9 @@ $(eval $(call .populate_cpus,ONEAPI.objs_y.dpc,$(ONEAPI.objs_y.dpc))) define .ONEAPI.compile $(eval template_source_cpp := $(1:$2/%.$o=%.cpp)) $(eval template_source_cpp := $(subst -,/,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_nrh,_cpu,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_neh,_cpu,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_hsw,_cpu,$(template_source_cpp))) -$(eval template_source_cpp := $(subst _cpu_skx,_cpu,$(template_source_cpp))) + +$(eval $(call subst_arch_cpu_in_var,template_source_cpp)) + $1: $(template_source_cpp) | $(dir $1)/. ; $(value $3.COMPILE) endef @@ -671,10 +629,7 @@ $1: LOPT:= $1: $(1:%.$a=%_link.txt) | $(dir $1)/. ; $(value LINK.STATIC) endef -$(ONEAPI.dispatcher_cpu): | $(dir $(ONEAPI.dispatcher_cpu))/. 
- $(if $(filter sse42,$(USECPUS)),echo "#define ONEDAL_CPU_DISPATCH_SSE42" >> $@) - $(if $(filter avx2,$(USECPUS)),echo "#define ONEDAL_CPU_DISPATCH_AVX2" >> $@) - $(if $(filter avx512,$(USECPUS)),echo "#define ONEDAL_CPU_DISPATCH_AVX512" >> $@) +$(eval $(call dispatcher_cpu_rule,$(ONEAPI.dispatcher_cpu),$(USECPUS))) # Create file with include paths ONEAPI.include_options := $(addprefix -I, $(ONEAPI.incdirs.common)) \ @@ -701,10 +656,8 @@ $(ONEAPI.objs_a): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-EHsc) $(pedantic -D__TBB_NO_IMPLICIT_LINKAGE \ -DTBB_USE_ASSERT=0 \ @$(ONEAPI.tmpdir_a)/inc_a_folders.txt -$(call containing,_nrh, $(ONEAPI.objs_a)): COPT += $(p4_OPT) $(ONEAPI.dispatcher_tag.nrh) -$(call containing,_neh, $(ONEAPI.objs_a)): COPT += $(mc3_OPT) $(ONEAPI.dispatcher_tag.neh) -$(call containing,_hsw, $(ONEAPI.objs_a)): COPT += $(avx2_OPT) $(ONEAPI.dispatcher_tag.hsw) -$(call containing,_skx, $(ONEAPI.objs_a)): COPT += $(skx_OPT) $(ONEAPI.dispatcher_tag.skx) + +$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_a))) $(ONEAPI.objs_a.dpc): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_a.dpc)/inc_a_folders.txt $(ONEAPI.objs_a.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \ @@ -716,10 +669,8 @@ $(ONEAPI.objs_a.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.op -D_ENABLE_ATOMIC_ALIGNMENT_FIX \ -DTBB_USE_ASSERT=0 \ @$(ONEAPI.tmpdir_a.dpc)/inc_a_folders.txt -$(call containing,_nrh, $(ONEAPI.objs_a.dpc)): COPT += $(p4_OPT.dpcpp) $(ONEAPI.dispatcher_tag.nrh) -$(call containing,_neh, $(ONEAPI.objs_a.dpc)): COPT += $(mc3_OPT.dpcpp) $(ONEAPI.dispatcher_tag.neh) -$(call containing,_hsw, $(ONEAPI.objs_a.dpc)): COPT += $(avx2_OPT.dpcpp) $(ONEAPI.dispatcher_tag.hsw) -$(call containing,_skx, $(ONEAPI.objs_a.dpc)): COPT += $(skx_OPT.dpcpp) $(ONEAPI.dispatcher_tag.skx) + +$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_a.dpc),.dpcpp)) # Set compilation options to the object files which are part of DYNAMIC lib 
$(ONEAPI.objs_y): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_y)/inc_y_folders.txt @@ -732,10 +683,8 @@ $(ONEAPI.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-EHsc) $(pedantic -D__TBB_NO_IMPLICIT_LINKAGE \ -DTBB_USE_ASSERT=0 \ @$(ONEAPI.tmpdir_y)/inc_y_folders.txt -$(call containing,_nrh, $(ONEAPI.objs_y)): COPT += $(p4_OPT) $(ONEAPI.dispatcher_tag.nrh) -$(call containing,_neh, $(ONEAPI.objs_y)): COPT += $(mc3_OPT) $(ONEAPI.dispatcher_tag.neh) -$(call containing,_hsw, $(ONEAPI.objs_y)): COPT += $(avx2_OPT) $(ONEAPI.dispatcher_tag.hsw) -$(call containing,_skx, $(ONEAPI.objs_y)): COPT += $(skx_OPT) $(ONEAPI.dispatcher_tag.skx) + +$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_y))) $(ONEAPI.objs_y.dpc): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_y.dpc)/inc_y_folders.txt $(ONEAPI.objs_y.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \ @@ -749,10 +698,8 @@ $(ONEAPI.objs_y.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.op -D__TBB_NO_IMPLICIT_LINKAGE \ -DTBB_USE_ASSERT=0 \ @$(ONEAPI.tmpdir_y.dpc)/inc_y_folders.txt -$(call containing,_nrh, $(ONEAPI.objs_y.dpc)): COPT += $(p4_OPT.dpcpp) $(ONEAPI.dispatcher_tag.nrh) -$(call containing,_neh, $(ONEAPI.objs_y.dpc)): COPT += $(mc3_OPT.dpcpp) $(ONEAPI.dispatcher_tag.neh) -$(call containing,_hsw, $(ONEAPI.objs_y.dpc)): COPT += $(avx2_OPT.dpcpp) $(ONEAPI.dispatcher_tag.hsw) -$(call containing,_skx, $(ONEAPI.objs_y.dpc)): COPT += $(skx_OPT.dpcpp) $(ONEAPI.dispatcher_tag.skx) + +$(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_y.dpc),.dpcpp)) # Filtering parameter files PARAMETERS.objs_a.filtered := $(filter %parameters.$(o),$(ONEAPI.objs_a)) @@ -1122,8 +1069,9 @@ $(foreach t,$(releasetbb.LIBS_Y),$(eval $(call .release.t,$t,$(RELEASEDIR.tbb.so $(foreach t,$(releasetbb.LIBS_A),$(eval $(call .release.t,$t,$(RELEASEDIR.tbb.libia)))) #----- cmake configs generation + _release_cmake_configs: - $(if $(shell bash -c "command -v cmake"),cmake 
-DINSTALL_DIR=$(RELEASEDIR.lib)/cmake/oneDAL -P cmake/scripts/generate_config.cmake,echo 'cmake configs generation skipped') + $(if $(shell bash -c "command -v cmake"),cmake -DINSTALL_DIR=$(RELEASEDIR.lib)/cmake/oneDAL -DARCH_DIR_ONEDAL=$(ARCH_DIR_ONEDAL) -P cmake/scripts/generate_config.cmake,echo 'cmake configs generation skipped') #----- nuspecs generation _release_common: _release_nuspec diff --git a/makefile.ver b/makefile.ver index e9941372c02..a22557005c0 100644 --- a/makefile.ver +++ b/makefile.ver @@ -15,7 +15,7 @@ #=============================================================================== MAJOR = 2024 -MINOR = 2 +MINOR = 3 UPDATE = 0 BUILD = $(shell date +'%Y%m%d') STATUS = P diff --git a/samples/oneapi/dpc/ccl/onedal_lnx.lst b/samples/oneapi/dpc/ccl/onedal_lnx.lst index 43e4812bfd4..ffbee1afd11 100644 --- a/samples/oneapi/dpc/ccl/onedal_lnx.lst +++ b/samples/oneapi/dpc/ccl/onedal_lnx.lst @@ -20,7 +20,9 @@ MPI = basic_statistics_distr_ccl \ cor_distr_ccl \ + cov_biased_distr_ccl \ cov_distr_ccl \ + cov_online_distr_ccl \ dbscan_distr_ccl \ decision_forest_cls_hist_distr_ccl \ decision_forest_reg_hist_distr_ccl \ diff --git a/samples/oneapi/dpc/ccl/sources/cov_online_distr_ccl.cpp b/samples/oneapi/dpc/ccl/sources/cov_online_distr_ccl.cpp new file mode 100644 index 00000000000..d10831a6df4 --- /dev/null +++ b/samples/oneapi/dpc/ccl/sources/cov_online_distr_ccl.cpp @@ -0,0 +1,79 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include +#include +#include + +#ifndef ONEDAL_DATA_PARALLEL +#define ONEDAL_DATA_PARALLEL +#endif + +#include "oneapi/dal/algo/covariance.hpp" +#include "oneapi/dal/spmd/ccl/communicator.hpp" +#include "oneapi/dal/io/csv.hpp" + +#include "utils.hpp" + +namespace dal = oneapi::dal; + +void run(sycl::queue& queue) { + const auto data_file_name = get_data_path("data/covcormoments_dense.csv"); + const std::int64_t nBlocks = 10; + const auto data = dal::read(queue, dal::csv::data_source{ data_file_name }); + + const auto cov_desc = dal::covariance::descriptor{}.set_result_options( + dal::covariance::result_options::cov_matrix); + + auto comm = dal::preview::spmd::make_communicator(queue); + auto rank_id = comm.get_rank(); + auto rank_count = comm.get_rank_count(); + + auto input_vec = split_table_by_rows(queue, data, rank_count); + + auto input_blocks = split_table_by_rows(queue, input_vec[rank_id], nBlocks); + dal::covariance::partial_compute_result<> partial_result; + + for (std::int64_t i = 0; i < nBlocks; i++) { + partial_result = dal::partial_compute(queue, cov_desc, partial_result, input_blocks[i]); + } + const auto result = dal::preview::finalize_compute(comm, cov_desc, partial_result); + + if (comm.get_rank() == 0) { + std::cout << "Sample covariance:\n" << result.get_cov_matrix() << std::endl; + } +} + +int main(int argc, char const* argv[]) { + ccl::init(); + int status = MPI_Init(nullptr, nullptr); + if (status != MPI_SUCCESS) { + throw std::runtime_error{ "Problem occurred during MPI init" }; + } + + auto device = sycl::device(sycl::gpu_selector_v); + std::cout << "Running on " << device.get_platform().get_info() + << ", " << device.get_info() << std::endl; + sycl::queue q{ device }; + run(q); + + status = MPI_Finalize(); + if (status != MPI_SUCCESS) { + throw 
std::runtime_error{ "Problem occurred during MPI finalize" }; + } + return 0; +} diff --git a/samples/oneapi/dpc/mpi/onedal_lnx.lst b/samples/oneapi/dpc/mpi/onedal_lnx.lst index 47764bfba7b..3bcceee3290 100644 --- a/samples/oneapi/dpc/mpi/onedal_lnx.lst +++ b/samples/oneapi/dpc/mpi/onedal_lnx.lst @@ -20,7 +20,9 @@ MPI = basic_statistics_distr_mpi \ cor_distr_mpi \ + cov_biased_distr_mpi \ cov_distr_mpi \ + cov_online_distr_mpi \ dbscan_distr_mpi \ decision_forest_cls_hist_distr_mpi \ decision_forest_reg_hist_distr_mpi \ diff --git a/samples/oneapi/dpc/mpi/sources/cov_online_distr_mpi.cpp b/samples/oneapi/dpc/mpi/sources/cov_online_distr_mpi.cpp new file mode 100644 index 00000000000..5126ccdbf5c --- /dev/null +++ b/samples/oneapi/dpc/mpi/sources/cov_online_distr_mpi.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include +#include +#include + +#ifndef ONEDAL_DATA_PARALLEL +#define ONEDAL_DATA_PARALLEL +#endif + +#include "oneapi/dal/algo/covariance.hpp" +#include "oneapi/dal/spmd/mpi/communicator.hpp" +#include "oneapi/dal/io/csv.hpp" + +#include "utils.hpp" + +namespace dal = oneapi::dal; + +void run(sycl::queue& queue) { + const auto data_file_name = get_data_path("data/covcormoments_dense.csv"); + const std::int64_t nBlocks = 10; + const auto data = dal::read(queue, dal::csv::data_source{ data_file_name }); + + const auto cov_desc = dal::covariance::descriptor{}.set_result_options( + dal::covariance::result_options::cov_matrix); + + auto comm = dal::preview::spmd::make_communicator(queue); + auto rank_id = comm.get_rank(); + auto rank_count = comm.get_rank_count(); + + auto input_vec = split_table_by_rows(queue, data, rank_count); + + auto input_blocks = split_table_by_rows(queue, input_vec[rank_id], nBlocks); + dal::covariance::partial_compute_result<> partial_result; + + for (std::int64_t i = 0; i < nBlocks; i++) { + partial_result = dal::partial_compute(queue, cov_desc, partial_result, input_blocks[i]); + } + const auto result = dal::preview::finalize_compute(comm, cov_desc, partial_result); + + if (comm.get_rank() == 0) { + std::cout << "Sample covariance:\n" << result.get_cov_matrix() << std::endl; + } +} + +int main(int argc, char const* argv[]) { + int status = MPI_Init(nullptr, nullptr); + if (status != MPI_SUCCESS) { + throw std::runtime_error{ "Problem occurred during MPI init" }; + } + + auto device = sycl::device(sycl::gpu_selector_v); + std::cout << "Running on " << device.get_platform().get_info() + << ", " << device.get_info() << std::endl; + sycl::queue q{ device }; + run(q); + + status = MPI_Finalize(); + if (status != MPI_SUCCESS) { + throw std::runtime_error{ "Problem occurred during MPI finalize" }; + } + return 0; +}