diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt
index 88498cd4d68..a58106aac42 100644
--- a/src/plugins/intel_cpu/CMakeLists.txt
+++ b/src/plugins/intel_cpu/CMakeLists.txt
@@ -33,8 +33,17 @@ else()
 endif()
 set(OV_CPU_ARM_TARGET_ARCH ${OV_CPU_ARM_TARGET_ARCH_DEFAULT})
 
-# enbale mlas for X86 cpus only
-ie_dependent_option(ENABLE_MLAS_FOR_CPU "MLAS GEMM for OpenVINO CPU Plugin" ON "X86 OR X86_64" OFF)
+if(X86 OR X86_64 OR AARCH64)
+    if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)
+        set(ENABLE_MLAS_FOR_CPU_DEFAULT OFF)
+    else()
+        set(ENABLE_MLAS_FOR_CPU_DEFAULT ON)
+    endif()
+else()
+    set(ENABLE_MLAS_FOR_CPU_DEFAULT OFF)
+endif()
+ie_option(ENABLE_MLAS_FOR_CPU "Enable MLAS for OpenVINO CPU Plugin" ${ENABLE_MLAS_FOR_CPU_DEFAULT})
+
 add_subdirectory(thirdparty)
 
 if(WIN32)
@@ -80,6 +89,7 @@ if(NOT (AARCH64 OR ARM))
 endif()
 
 if (NOT ENABLE_MLAS_FOR_CPU)
+    list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/mlas/*)
     list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/mlas/*)
 endif()
 
diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp
index f79bbd86cd2..90d8e6e8d83 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_opt_transpose.hpp
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include "../transpose.hpp"
+#include "nodes/executors/transpose.hpp"
 #include "utils/debug_capabilities.h"
 
 namespace ov {
diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp
index 23195dcbd72..0edabdff028 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/common/ref_transpose.hpp
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include "../transpose.hpp"
+#include "nodes/executors/transpose.hpp"
 
 namespace ov {
 namespace intel_cpu {
diff --git a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp
index 6b920dcd753..a74ac434cea 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/executor.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/executor.hpp
@@ -11,6 +11,13 @@
 namespace ov {
 namespace intel_cpu {
 
+#if defined(OV_CPU_WITH_MLAS) && defined(OPENVINO_ARCH_ARM64)
+#define OV_CPU_INSTANCE_MLAS_ARM64(...) \
+    {__VA_ARGS__},
+#else
+#define OV_CPU_INSTANCE_MLAS_ARM64(...)
+#endif
+
 #if defined(OV_CPU_WITH_ACL)
 #define OV_CPU_INSTANCE_ACL(...) \
     {__VA_ARGS__},
@@ -40,7 +47,8 @@ enum class ExecutorType {
     Common,
     x64,
     Dnnl,
-    Acl
+    Acl,
+    Mlas
 };
 
 class ExecutorContext {
diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp
new file mode 100644
index 00000000000..9ba3ef051a2
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.cpp
@@ -0,0 +1,314 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "mlas_transpose.hpp"
+#include "ie_parallel.hpp"
+#include "nodes/common/cpu_memcpy.h"
+#include "mlas.h"
+
+using namespace InferenceEngine;
+
+namespace ov {
+namespace intel_cpu {
+
+template <typename T>
+struct has_mlas_transpose : std::false_type {};
+
+template <>
+struct has_mlas_transpose<uint8_t> : std::true_type {};
+
+template <>
+struct has_mlas_transpose<uint16_t> : std::true_type {};
+
+template <>
+struct has_mlas_transpose<uint32_t> : std::true_type {};
+
+template <typename T>
+typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
+    const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) {
+    const T* end;
+    for (int64_t l = 0; l < num_loops; ++l) {
+        T* output_for_first_writer = output_data;
+        for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
+            T* output_for_current_writer = output_for_first_writer;
+            end = input_data + num_writers;
+            for (; input_data != end;) {
+                *output_for_current_writer = *input_data++;
+                // skip to output position for next writer
+                output_for_current_writer += writes_per_writer_per_loop;
+            }
+            ++output_for_first_writer;
+        }
+        output_data += writes_per_loop;
+    }
+}
+
+template <typename T>
+typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
+    const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) {
+    for (int64_t l = 0; l < num_loops; ++l) {
+        MlasTranspose(input_data, output_data, static_cast<size_t>(writes_per_writer_per_loop),
+                      static_cast<size_t>(num_writers));
+        input_data += writes_per_loop;
+        output_data += writes_per_loop;
+    }
+}
+
+template <typename T>
+typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
+    const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) {
+    T* end;
+    for (int64_t l = 0; l < num_loops; ++l) {
+        const T* input_for_first_reader = input_data;
+        for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
+            const T* input_for_current_reader = input_for_first_reader;
+            end = output_data + num_readers;
+            for (; output_data != end;) {
+                *output_data++ = *input_for_current_reader;
+                // skip to input position for next reader
+                input_for_current_reader += reads_per_reader_per_loop;
+            }
+            ++input_for_first_reader;
+        }
+        input_data += reads_per_loop;
+    }
+}
+
+template <typename T>
+typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
+    const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) {
+    for (int64_t l = 0; l < num_loops; ++l) {
+        MlasTranspose(input_data, output_data, static_cast<size_t>(num_readers),
+                      static_cast<size_t>(reads_per_reader_per_loop));
+        input_data += reads_per_loop;
+        output_data += reads_per_loop;
+    }
+}
+
+int64_t MlasTransposeExecutor::calcShapeSize(const Shape& shape, size_t start, size_t end) {
+    int64_t size = 1;
+    for (size_t i = start; i < end; i++) {
+        size *= shape.getDims()[i];
+    }
+    return size;
+}
+
+bool MlasTransposeExecutor::IsTransposeMovingSingleAxis(SizeVector permutations, size_t& from, size_t& to) {
+    // if a single axis moved to an outer dimension, the values should be one lower than the index until the slot the
+    // axis was moved from, and equal to the index after that.
+    // e.g. axis 3 moves out to 1 would be: 0, 3, 1, 2, 4
+    auto check_moved_outwards = [&permutations](size_t cur, size_t moved_from) {
+        // we start processing with the slot after the moved one, so the expected value is one less than the index
+        size_t expected = cur - 1;
+        for (size_t end = permutations.size(); cur < end; ++cur) {
+            if (permutations[cur] != expected) {
+                return false;
+            }
+            // we are at the slot the axis moved from, so do an additional increment before checking the next value
+            if (cur == moved_from) {
+                ++expected;
+            }
+            ++expected;
+        }
+        return true;
+    };
+    // if a single axis moved to an inner dimension, the values should be one higher than the index until the slot the
+    // axis was moved to, and equal to the index after that.
+    // e.g. axis 1 moves inwards to 3 would be: 0, 2, 3, 1, 4
+    auto check_moved_inwards = [&permutations](size_t cur, size_t& moved_to) {
+        size_t started_at = cur;
+        size_t expected = cur + 1;
+        moved_to = std::numeric_limits<size_t>::max();
+        for (size_t end = permutations.size(); cur < end; ++cur) {
+            if (permutations[cur] != expected) {
+                // if a single axis moved it must have come from the location we started at
+                if (started_at != permutations[cur]) {
+                    return false;
+                }
+                moved_to = cur;
+            } else {
+                ++expected;
+            }
+        }
+        return moved_to != std::numeric_limits<size_t>::max();
+    };
+    bool single_axis_moved = false;
+    // check axis moving outwards (earlier entry in permutations)
+    for (size_t i = 0, end = permutations.size(); i < end; ++i) {
+        size_t axis = permutations[i];
+        if (axis != i) {
+            if (check_moved_outwards(i + 1, axis)) {
+                single_axis_moved = true;
+                to = i;
+                from = axis;
+            } else if (check_moved_inwards(i, to)) {
+                single_axis_moved = true;
+                from = i;
+            }
+            break;
+        }
+    }
+    return single_axis_moved;
+}
+
+void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) {
+    const auto& input_shape = input->getShape();
+    const auto& input_dims = input_shape.getDims();
+    const auto element_size = input->getDesc().getPrecision().size();
+
+    const auto* input_data = reinterpret_cast<const uint8_t*>(input->getData());
+    auto* output_data = reinterpret_cast<uint8_t*>(output->getData());
+
+    auto num_loops = calcShapeSize(input_shape, 0, to);
+    auto num_writers = input_dims[from];
+    auto block_size = calcShapeSize(input_shape, from + 1, input_shape.getRank());
+    auto writes_per_loop = int64_t(input_shape.getElementsCount() / num_loops / block_size);
+    auto writes_per_writer_per_loop = int64_t(writes_per_loop / num_writers);
+    // TODO: check integer overflow
+    const size_t bytes_per_write = static_cast<size_t>(block_size) * element_size;
+
+    switch (bytes_per_write) {
+        case (sizeof(uint8_t)): {
+            SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop,
+                                              writes_per_writer_per_loop);
+            break;
+        }
+        case (sizeof(uint16_t)): {
+            SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint16_t*>(input_data),
+                                              reinterpret_cast<uint16_t*>(output_data), num_loops, num_writers,
+                                              writes_per_loop, writes_per_writer_per_loop);
+            break;
+        }
+        case (sizeof(uint32_t)): {
+            SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint32_t*>(input_data),
+                                              reinterpret_cast<uint32_t*>(output_data), num_loops, num_writers,
+                                              writes_per_loop, writes_per_writer_per_loop);
+            break;
+        }
+        case (sizeof(uint64_t)): {
+            SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint64_t*>(input_data),
+                                              reinterpret_cast<uint64_t*>(output_data), num_loops, num_writers,
+                                              writes_per_loop, writes_per_writer_per_loop);
+            break;
+        }
+        default: {
+            // we need to use memcpy for each block
+            for (int64_t l = 0; l < num_loops; ++l) {
+                uint8_t* output_for_first_writer = output_data;
+
+                for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
+                    uint8_t* output_for_current_writer = output_for_first_writer;
+
+                    for (uint64_t w = 0; w < num_writers; ++w) {
+                        memcpy(output_for_current_writer, input_data, bytes_per_write);
+                        // skip to output position for next writer
+                        output_for_current_writer += (writes_per_writer_per_loop * bytes_per_write);
+                        input_data += bytes_per_write;
+                    }
+                    output_for_first_writer += bytes_per_write;
+                }
+                output_data += writes_per_loop * bytes_per_write;
+            }
+        }
+    }
+}
+
+void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) {
+    const auto& input_shape = input->getShape();
+    const auto& input_dims = input_shape.getDims();
+
+    const auto element_size = input->getDesc().getPrecision().size();
+    const auto* input_data = reinterpret_cast<const uint8_t*>(input->getData());
+    auto* output_data = reinterpret_cast<uint8_t*>(output->getData());
+
+    auto num_loops = calcShapeSize(input_shape, 0, from);
+    auto num_readers = input_dims[from];
+    auto block_size = calcShapeSize(input_shape, to + 1, input_shape.getRank());
+    auto reads_per_loop = int64_t(input_shape.getElementsCount() / num_loops / block_size);
+    auto reads_per_reader_per_loop = int64_t(reads_per_loop / num_readers);
+    // TODO: check integer overflow
+    const size_t bytes_per_read = static_cast<size_t>(block_size) * element_size;
+
+    switch (bytes_per_read) {
+        case (sizeof(uint8_t)): {
+            SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop,
+                                             reads_per_reader_per_loop);
+            break;
+        }
+        case (sizeof(uint16_t)): {
+            SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint16_t*>(input_data),
+                                             reinterpret_cast<uint16_t*>(output_data), num_loops, num_readers, reads_per_loop,
+                                             reads_per_reader_per_loop);
+            break;
+        }
+        case (sizeof(uint32_t)): {
+            SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint32_t*>(input_data),
+                                             reinterpret_cast<uint32_t*>(output_data), num_loops, num_readers, reads_per_loop,
+                                             reads_per_reader_per_loop);
+            break;
+        }
+        case (sizeof(uint64_t)): {
+            SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint64_t*>(input_data),
+                                             reinterpret_cast<uint64_t*>(output_data), num_loops, num_readers, reads_per_loop,
+                                             reads_per_reader_per_loop);
+            break;
+        }
+        default: {
+            // we need to use memcpy for each block
+            for (int64_t l = 0; l < num_loops; ++l) {
+                const uint8_t* input_for_first_reader = input_data;
+                for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
+                    const uint8_t* input_for_current_reader = input_for_first_reader;
+                    for (uint64_t r = 0; r < num_readers; ++r) {
+                        memcpy(output_data, input_for_current_reader, bytes_per_read);
+                        output_data += bytes_per_read;
+                        // skip to input position for next reader
+                        input_for_current_reader += (reads_per_reader_per_loop * bytes_per_read);
+                    }
+                    input_for_first_reader += bytes_per_read;
+                }
+                input_data += reads_per_loop * bytes_per_read;
+            }
+        }
+    }
+}
+
+void MlasTransposeExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const int MB) {
+    if (from > to) {
+        TransposeSingleAxisOutwards(src[0], dst[0], from, to);
+    } else {
+        TransposeSingleAxisInwards(src[0], dst[0], from, to);
+    }
+}
+
+bool MlasTransposeExecutor::init(const TransposeParams &transposeParams,
+                                 const std::vector<MemoryDescPtr> &srcDescs,
+                                 const std::vector<MemoryDescPtr> &dstDescs,
+                                 const dnnl::primitive_attr &attr) {
+    if (!IsTransposeMovingSingleAxis(transposeParams.permuteParams.order, from, to)) {
+        DEBUG_LOG("MLAS Transpose executor supports moving single axis only");
+        return false;
+    }
+    return true;
+}
+
+bool MlasTransposeExecutorBuilder::isSupported(const TransposeParams& transposeParams,
+                                               const std::vector<MemoryDescPtr>& srcDescs,
+                                               const std::vector<MemoryDescPtr>& dstDescs) const {
+    if (!srcDescs[0]->hasLayoutType(LayoutType::ncsp) ||
+        !dstDescs[0]->hasLayoutType(LayoutType::ncsp)) {
+        DEBUG_LOG("MLAS Transpose executor supports NCHW layout only");
+        return false;
+    }
+    if (!one_of(srcDescs[0]->getPrecision().size(), 1u, 2u, 4u, 8u)) {
+        DEBUG_LOG("MLAS Transpose executor supports 1, 2, 4, 8 byte precision sizes");
+        return false;
+    }
+    return true;
+}
+
+TransposeExecutorPtr MlasTransposeExecutorBuilder::makeExecutor(const ExecutorContext::CPtr context) const {
+    return std::make_shared<MlasTransposeExecutor>(context);
+}
+
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp
new file mode 100644
index 00000000000..ab44d50ac44
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_transpose.hpp
@@ -0,0 +1,42 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "nodes/executors/transpose.hpp"
+
+namespace ov {
+namespace intel_cpu {
+class MlasTransposeExecutor : public TransposeExecutor {
+public:
+    using TransposeExecutor::TransposeExecutor;
+    bool init(const TransposeParams &transposeParams,
+              const std::vector<MemoryDescPtr> &srcDescs,
+              const std::vector<MemoryDescPtr> &dstDescs,
+              const dnnl::primitive_attr &attr) override;
+    void exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst, const int MB) override;
+
+    impl_desc_type getImplType() const override { return implType; }
+private:
+    static int64_t calcShapeSize(const Shape& shape, size_t start, size_t end);
+    static bool IsTransposeMovingSingleAxis(InferenceEngine::SizeVector permutations, size_t& from, size_t& to);
+    void TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to);
+    void TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to);
+
+    static const impl_desc_type implType = impl_desc_type::mlas;
+    size_t from;
+    size_t to;
+};
+
+class MlasTransposeExecutorBuilder : public TransposeExecutorBuilder {
+public:
+    bool isSupported(const TransposeParams& transposeParams,
+                     const std::vector<MemoryDescPtr>& srcDescs,
+                     const std::vector<MemoryDescPtr>& dstDescs) const override;
+
+    TransposeExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override;
+};
+
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp
index 1249aad5685..eba1f9326c7 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.cpp
@@ -9,6 +9,7 @@ namespace intel_cpu {
 
 const std::vector<TransposeExecutorDesc>& getTransposeExecutorsList() {
     static const std::vector<TransposeExecutorDesc> descs = {
+            OV_CPU_INSTANCE_MLAS_ARM64(ExecutorType::Mlas, std::make_shared<MlasTransposeExecutorBuilder>())
             OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared<RefOptimizedTransposeExecutorBuilder>())
             OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<ACLTransposeExecutorBuilder>())
             OV_CPU_INSTANCE_X64(ExecutorType::x64, std::make_shared<JitTransposeExecutorBuilder>())
diff --git a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp
index 89d322ed91f..63fa736c4cd 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/transpose_list.hpp
@@ -13,6 +13,7 @@
 
 #include "common/ref_opt_transpose.hpp"
 #include "common/ref_transpose.hpp"
+#include "mlas/mlas_transpose.hpp"
 #include "x64/jit_transpose.hpp"
 
 #include "onednn/iml_type_mapper.h"
diff --git a/src/plugins/intel_cpu/thirdparty/mlas b/src/plugins/intel_cpu/thirdparty/mlas
index 519abf79de5..1d68240b511 160000
--- a/src/plugins/intel_cpu/thirdparty/mlas
+++ b/src/plugins/intel_cpu/thirdparty/mlas
@@ -1 +1 @@
-Subproject commit 519abf79de5ee295cfe5bbed97037a2623616c80
+Subproject commit 1d68240b5114326604c3f5af47ac1c098e30b254
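
Note (illustration, not part of the patch): MlasTransposeExecutor only accepts permutations that move a single axis outwards or inwards while keeping the relative order of the remaining axes, which is what IsTransposeMovingSingleAxis checks before init() succeeds; other permutations fall through to the remaining entries in getTransposeExecutorsList(). The following is a minimal standalone C++ sketch of that classification, using a hypothetical helper name (movesSingleAxis) and simplified logic rather than the exact code from the patch:

#include <cstddef>
#include <iostream>
#include <vector>

// Returns true if 'perm' just moves one axis to a new slot while keeping the
// relative order of the other axes, e.g. {0, 3, 1, 2, 4} (axis 3 moved out to
// slot 1) or {0, 2, 3, 1, 4} (axis 1 moved in to slot 3).
static bool movesSingleAxis(const std::vector<size_t>& perm, size_t& from, size_t& to) {
    for (size_t i = 0; i < perm.size(); ++i) {
        if (perm[i] == i)
            continue;  // prefix is untouched
        // Candidate 1: axis perm[i] moved outwards to slot i.
        {
            size_t expected = i;
            bool ok = true;
            for (size_t j = i + 1; j < perm.size(); ++j) {
                if (expected == perm[i])  // skip the value of the moved axis
                    ++expected;
                if (perm[j] != expected++) {
                    ok = false;
                    break;
                }
            }
            if (ok) {
                from = perm[i];
                to = i;
                return true;
            }
        }
        // Candidate 2: axis i moved inwards; locate the slot that now holds it.
        {
            size_t expected = i + 1;
            size_t moved_to = perm.size();
            bool ok = true;
            for (size_t j = i; j < perm.size(); ++j) {
                if (perm[j] == expected) {
                    ++expected;
                    continue;
                }
                if (perm[j] != i) {  // a second axis moved -> not a single-axis move
                    ok = false;
                    break;
                }
                moved_to = j;
            }
            if (ok && moved_to != perm.size()) {
                from = i;
                to = moved_to;
                return true;
            }
        }
        return false;
    }
    return false;  // identity permutation: nothing moved
}

int main() {
    size_t from = 0, to = 0;
    std::cout << movesSingleAxis({0, 3, 1, 2, 4}, from, to)
              << " from=" << from << " to=" << to << "\n";        // 1 from=3 to=1
    std::cout << movesSingleAxis({0, 2, 3, 1, 4}, from, to) << "\n";  // 1 (axis 1 moved in to slot 3)
    std::cout << movesSingleAxis({1, 0, 3, 2}, from, to) << "\n";     // 0 (two independent swaps)
}

When the check fails (as in the last example above), the patched executor's init() returns false and transpose falls back to the reference or platform-specific implementations registered alongside it.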