[CPU][ARM] MLAS Transpose executor (#18879)

This commit is contained in:
Aleksandr Voron 2023-08-02 14:48:09 +02:00 committed by GitHub
parent 62fa09a181
commit 7b4a7e5eb4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 382 additions and 6 deletions

View File

@ -33,8 +33,17 @@ else()
endif()
set(OV_CPU_ARM_TARGET_ARCH ${OV_CPU_ARM_TARGET_ARCH_DEFAULT})
# enable MLAS for x86 CPUs only
ie_dependent_option(ENABLE_MLAS_FOR_CPU "MLAS GEMM for OpenVINO CPU Plugin" ON "X86 OR X86_64" OFF)
if(X86 OR X86_64 OR AARCH64)
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)
set(ENABLE_MLAS_FOR_CPU_DEFAULT OFF)
else()
set(ENABLE_MLAS_FOR_CPU_DEFAULT ON)
endif()
else()
set(ENABLE_MLAS_FOR_CPU_DEFAULT OFF)
endif()
ie_option(ENABLE_MLAS_FOR_CPU "Enable MLAS for OpenVINO CPU Plugin" ${ENABLE_MLAS_FOR_CPU_DEFAULT})
add_subdirectory(thirdparty)
if(WIN32)
@ -80,6 +89,7 @@ if(NOT (AARCH64 OR ARM))
endif()
if (NOT ENABLE_MLAS_FOR_CPU)
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/mlas/*)
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/mlas/*)
endif()

View File

@ -4,7 +4,7 @@
#pragma once
#include "../transpose.hpp"
#include "nodes/executors/transpose.hpp"
#include "utils/debug_capabilities.h"
namespace ov {

View File

@ -4,7 +4,7 @@
#pragma once
#include "../transpose.hpp"
#include "nodes/executors/transpose.hpp"
namespace ov {
namespace intel_cpu {

View File

@ -11,6 +11,13 @@
namespace ov {
namespace intel_cpu {
#if defined(OV_CPU_WITH_MLAS) && defined(OPENVINO_ARCH_ARM64)
#define OV_CPU_INSTANCE_MLAS_ARM64(...) \
{__VA_ARGS__},
#else
#define OV_CPU_INSTANCE_MLAS_ARM64(...)
#endif
#if defined(OV_CPU_WITH_ACL)
#define OV_CPU_INSTANCE_ACL(...) \
{__VA_ARGS__},
@ -40,7 +47,8 @@ enum class ExecutorType {
Common,
x64,
Dnnl,
Acl
Acl,
Mlas
};
class ExecutorContext {

View File

@ -0,0 +1,314 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "mlas_transpose.hpp"
#include "ie_parallel.hpp"
#include "nodes/common/cpu_memcpy.h"
#include "mlas.h"
using namespace InferenceEngine;
namespace ov {
namespace intel_cpu {
// Trait: true for the element types MlasTranspose() can handle natively
// (1/2/4-byte unsigned integers); all other types use the scalar fallback.
template <typename T>
struct has_mlas_transpose : std::false_type {};
template <>
struct has_mlas_transpose<uint8_t> : std::true_type {};
template <>
struct has_mlas_transpose<uint16_t> : std::true_type {};
template <>
struct has_mlas_transpose<uint32_t> : std::true_type {};

// Scalar fallback for moving one axis outwards. Per loop iteration the input is
// read sequentially as a (writes_per_writer_per_loop x num_writers) tile and
// written out transposed: input[w * num_writers + j] -> output[j * writes_per_writer_per_loop + w].
template <typename T>
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
    const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
    int64_t writes_per_writer_per_loop) {
    for (int64_t l = 0; l < num_loops; ++l) {
        T* output_for_first_writer = output_data;
        // Counter must be int64_t: `auto wwpl = 0` deduces int, which overflows (UB)
        // once writes_per_writer_per_loop exceeds INT_MAX.
        for (int64_t wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
            T* output_for_current_writer = output_for_first_writer;
            const T* end = input_data + num_writers;
            while (input_data != end) {
                *output_for_current_writer = *input_data++;
                // skip to output position for next writer
                output_for_current_writer += writes_per_writer_per_loop;
            }
            ++output_for_first_writer;
        }
        output_data += writes_per_loop;
    }
}

// MLAS-accelerated variant of the outwards move for supported element types.
template <typename T>
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
    const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
    int64_t writes_per_writer_per_loop) {
    for (int64_t l = 0; l < num_loops; ++l) {
        // Each iteration transposes one (writes_per_writer_per_loop x num_writers) tile.
        MlasTranspose(input_data, output_data, static_cast<size_t>(writes_per_writer_per_loop),
                      static_cast<size_t>(num_writers));
        input_data += writes_per_loop;
        output_data += writes_per_loop;
    }
}

// Scalar fallback for moving one axis inwards. Per loop iteration the output is
// written sequentially as a (reads_per_reader_per_loop x num_readers) tile:
// input[j * reads_per_reader_per_loop + r] -> output[r * num_readers + j].
template <typename T>
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
    const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
    int64_t reads_per_reader_per_loop) {
    for (int64_t l = 0; l < num_loops; ++l) {
        const T* input_for_first_reader = input_data;
        // Counter must be int64_t for the same overflow reason as in the outwards path.
        for (int64_t rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
            const T* input_for_current_reader = input_for_first_reader;
            T* end = output_data + num_readers;
            while (output_data != end) {
                *output_data++ = *input_for_current_reader;
                // skip to input position for next reader
                input_for_current_reader += reads_per_reader_per_loop;
            }
            ++input_for_first_reader;
        }
        input_data += reads_per_loop;
    }
}

// MLAS-accelerated variant of the inwards move for supported element types.
template <typename T>
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
    const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
    int64_t reads_per_reader_per_loop) {
    for (int64_t l = 0; l < num_loops; ++l) {
        MlasTranspose(input_data, output_data, static_cast<size_t>(num_readers),
                      static_cast<size_t>(reads_per_reader_per_loop));
        input_data += reads_per_loop;
        output_data += reads_per_loop;
    }
}
// Product of the dimensions in the half-open range [start, end) — i.e. the
// flattened element count of that sub-range of the shape.
int64_t MlasTransposeExecutor::calcShapeSize(const Shape& shape, size_t start, size_t end) {
    const auto& dims = shape.getDims();
    int64_t total = 1;
    size_t idx = start;
    while (idx < end) {
        total *= dims[idx++];
    }
    return total;
}
// Returns true when `permutations` relocates exactly one axis while preserving
// the relative order of all others; fills `from` (original slot of the moved
// axis) and `to` (its destination slot). Returns false for any other pattern,
// leaving `from`/`to` unspecified.
bool MlasTransposeExecutor::IsTransposeMovingSingleAxis(SizeVector permutations, size_t& from, size_t& to) {
    // if a single axis moved to an outer dimension, the values should be one lower than the index until the slot the
    // axis was moved from, and equal to the index after that.
    // e.g. axis 3 moves out to 1 would be: 0, 3, 1, 2, 4
    auto check_moved_outwards = [&permutations](size_t cur, size_t moved_from) {
        // we start processing with the slot after the moved one, so the expected value is one less than the index
        size_t expected = cur - 1;
        for (size_t end = permutations.size(); cur < end; ++cur) {
            if (permutations[cur] != expected) {
                return false;
            }
            // we are at the slot the axis moved from, so do an additional increment before checking the next value
            if (cur == moved_from) {
                ++expected;
            }
            ++expected;
        }
        return true;
    };
    // if a single axis moved to an inner dimension, the values should be one higher than the index until the slot the
    // axis was moved to, and equal to the index after that.
    // e.g. axis 1 moves inwards to 3 would be: 0, 2, 3, 1, 4
    auto check_moved_inwards = [&permutations](size_t cur, size_t& moved_to) {
        size_t started_at = cur;
        size_t expected = cur + 1;
        // sentinel meaning "destination slot not found yet"
        moved_to = std::numeric_limits<size_t>::max();
        for (size_t end = permutations.size(); cur < end; ++cur) {
            if (permutations[cur] != expected) {
                // if a single axis moved it must have come from the location we started at
                if (started_at != permutations[cur]) {
                    return false;
                }
                moved_to = cur;
            } else {
                ++expected;
            }
        }
        // valid only if the moved axis was actually encountered somewhere
        return moved_to != std::numeric_limits<size_t>::max();
    };
    bool single_axis_moved = false;
    // check axis moving outwards (earlier entry in permutations)
    for (size_t i = 0, end = permutations.size(); i < end; ++i) {
        size_t axis = permutations[i];
        if (axis != i) {
            // First slot that differs from identity: either `axis` moved outwards
            // into slot i, or the axis originally at slot i moved inwards.
            if (check_moved_outwards(i + 1, axis)) {
                single_axis_moved = true;
                to = i;
                from = axis;
            } else if (check_moved_inwards(i, to)) {
                single_axis_moved = true;
                from = i;
            }
            break;
        }
    }
    return single_axis_moved;
}
// Moves axis `from` outwards to slot `to` (to < from), copying input to output.
// Dispatches to SimpleTransposeSingleAxisOutwards for 1/2/4/8-byte blocks and
// falls back to per-block memcpy for any other block size.
void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) {
    const auto& input_shape = input->getShape();
    const auto& input_dims = input_shape.getDims();
    const auto element_size = input->getDesc().getPrecision().size();
    const auto* input_data = reinterpret_cast<const uint8_t*>(input->getData());
    auto* output_data = reinterpret_cast<uint8_t*>(output->getData());
    // Outer dims before the destination slot are untouched and iterated as-is.
    auto num_loops = calcShapeSize(input_shape, 0, to);
    auto num_writers = input_dims[from];
    // Contiguous tail after the moved axis is copied as one block.
    auto block_size = calcShapeSize(input_shape, from + 1, input_shape.getRank());
    auto writes_per_loop = int64_t(input_shape.getElementsCount() / num_loops / block_size);
    auto writes_per_writer_per_loop = int64_t(writes_per_loop / num_writers);
    // TODO: check integer overflow
    const size_t bytes_per_write = static_cast<size_t>(block_size) * element_size;
    switch (bytes_per_write) {
    case (sizeof(uint8_t)): {
        SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop,
                                          writes_per_writer_per_loop);
        break;
    }
    case (sizeof(uint16_t)): {
        SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint16_t*>(input_data),
                                          reinterpret_cast<uint16_t*>(output_data), num_loops, num_writers,
                                          writes_per_loop, writes_per_writer_per_loop);
        break;
    }
    case (sizeof(uint32_t)): {
        SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint32_t*>(input_data),
                                          reinterpret_cast<uint32_t*>(output_data), num_loops, num_writers,
                                          writes_per_loop, writes_per_writer_per_loop);
        break;
    }
    case (sizeof(uint64_t)): {
        SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint64_t*>(input_data),
                                          reinterpret_cast<uint64_t*>(output_data), num_loops, num_writers,
                                          writes_per_loop, writes_per_writer_per_loop);
        break;
    }
    default: {
        // we need to use memcpy for each block
        for (int64_t l = 0; l < num_loops; ++l) {
            uint8_t* output_for_first_writer = output_data;
            // int64_t counter: `auto wwpl = 0` would deduce int and overflow (UB)
            // when writes_per_writer_per_loop exceeds INT_MAX.
            for (int64_t wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
                uint8_t* output_for_current_writer = output_for_first_writer;
                for (uint64_t w = 0; w < num_writers; ++w) {
                    memcpy(output_for_current_writer, input_data, bytes_per_write);
                    // skip to output position for next writer
                    output_for_current_writer += (writes_per_writer_per_loop * bytes_per_write);
                    input_data += bytes_per_write;
                }
                output_for_first_writer += bytes_per_write;
            }
            output_data += writes_per_loop * bytes_per_write;
        }
    }
    }
}
// Moves axis `from` inwards to slot `to` (from < to), copying input to output.
// Dispatches to SimpleTransposeSingleAxisInwards for 1/2/4/8-byte blocks and
// falls back to per-block memcpy for any other block size.
void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) {
    const auto& input_shape = input->getShape();
    const auto& input_dims = input_shape.getDims();
    const auto element_size = input->getDesc().getPrecision().size();
    const auto* input_data = reinterpret_cast<const uint8_t*>(input->getData());
    auto* output_data = reinterpret_cast<uint8_t*>(output->getData());
    // Outer dims before the moved axis are untouched and iterated as-is.
    auto num_loops = calcShapeSize(input_shape, 0, from);
    auto num_readers = input_dims[from];
    // Contiguous tail after the destination slot is copied as one block.
    auto block_size = calcShapeSize(input_shape, to + 1, input_shape.getRank());
    auto reads_per_loop = int64_t(input_shape.getElementsCount() / num_loops / block_size);
    auto reads_per_reader_per_loop = int64_t(reads_per_loop / num_readers);
    // TODO: check integer overflow
    const size_t bytes_per_read = static_cast<size_t>(block_size) * element_size;
    switch (bytes_per_read) {
    case (sizeof(uint8_t)): {
        SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop,
                                         reads_per_reader_per_loop);
        break;
    }
    case (sizeof(uint16_t)): {
        SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint16_t*>(input_data),
                                         reinterpret_cast<uint16_t*>(output_data), num_loops, num_readers,
                                         reads_per_loop, reads_per_reader_per_loop);
        break;
    }
    case (sizeof(uint32_t)): {
        SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint32_t*>(input_data),
                                         reinterpret_cast<uint32_t*>(output_data), num_loops, num_readers,
                                         reads_per_loop, reads_per_reader_per_loop);
        break;
    }
    case (sizeof(uint64_t)): {
        SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint64_t*>(input_data),
                                         reinterpret_cast<uint64_t*>(output_data), num_loops, num_readers,
                                         reads_per_loop, reads_per_reader_per_loop);
        break;
    }
    default: {
        // we need to use memcpy for each block
        for (int64_t l = 0; l < num_loops; ++l) {
            const uint8_t* input_for_first_reader = input_data;
            // int64_t counter: `auto rrpl = 0` would deduce int and overflow (UB)
            // when reads_per_reader_per_loop exceeds INT_MAX.
            for (int64_t rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
                const uint8_t* input_for_current_reader = input_for_first_reader;
                for (uint64_t r = 0; r < num_readers; ++r) {
                    memcpy(output_data, input_for_current_reader, bytes_per_read);
                    output_data += bytes_per_read;
                    // skip to input position for next reader
                    input_for_current_reader += (reads_per_reader_per_loop * bytes_per_read);
                }
                input_for_first_reader += bytes_per_read;
            }
            input_data += reads_per_loop * bytes_per_read;
        }
    }
    }
}
// Runs the transpose of src[0] into dst[0], dispatching on the direction the
// single moved axis travels (from/to were cached by init()). MB is unused here.
void MlasTransposeExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const int MB) {
    const bool movesOutwards = from > to;
    if (movesOutwards) {
        TransposeSingleAxisOutwards(src[0], dst[0], from, to);
        return;
    }
    TransposeSingleAxisInwards(src[0], dst[0], from, to);
}
// Validates the requested permutation and caches the (from, to) axis move.
// Fails (returns false) for any permutation that moves more than one axis.
bool MlasTransposeExecutor::init(const TransposeParams &transposeParams,
                                 const std::vector<MemoryDescPtr> &srcDescs,
                                 const std::vector<MemoryDescPtr> &dstDescs,
                                 const dnnl::primitive_attr &attr) {
    const bool singleAxisMove = IsTransposeMovingSingleAxis(transposeParams.permuteParams.order, from, to);
    if (singleAxisMove) {
        return true;
    }
    DEBUG_LOG("MLAS Transpose executor supports moving single axis only");
    return false;
}
// Accepts only plain (ncsp/NCHW-like) layouts and element sizes the transpose
// kernels can reinterpret as uintN_t (1, 2, 4 or 8 bytes).
bool MlasTransposeExecutorBuilder::isSupported(const TransposeParams& transposeParams,
                                               const std::vector<MemoryDescPtr>& srcDescs,
                                               const std::vector<MemoryDescPtr>& dstDescs) const {
    const bool plainLayouts =
        srcDescs[0]->hasLayoutType(LayoutType::ncsp) && dstDescs[0]->hasLayoutType(LayoutType::ncsp);
    if (!plainLayouts) {
        DEBUG_LOG("MLAS Transpose executor supports NCHW layout only");
        return false;
    }
    switch (srcDescs[0]->getPrecision().size()) {
    case 1u:
    case 2u:
    case 4u:
    case 8u:
        return true;
    default:
        DEBUG_LOG("MLAS Transpose executor supports 1, 2, 4, 8 byte precision sizes");
        return false;
    }
}
// Factory method: each call returns a fresh executor bound to the given context.
TransposeExecutorPtr MlasTransposeExecutorBuilder::makeExecutor(const ExecutorContext::CPtr context) const {
    auto executor = std::make_shared<MlasTransposeExecutor>(context);
    return executor;
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,42 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "nodes/executors/transpose.hpp"
namespace ov {
namespace intel_cpu {
// Transpose executor that handles permutations moving exactly one axis,
// dispatching to MLAS-backed kernels for supported element sizes.
class MlasTransposeExecutor : public TransposeExecutor {
public:
    using TransposeExecutor::TransposeExecutor;
    // Validates the permutation and caches the (from, to) axis move;
    // returns false when more than a single axis is moved.
    bool init(const TransposeParams &transposeParams,
              const std::vector<MemoryDescPtr> &srcDescs,
              const std::vector<MemoryDescPtr> &dstDescs,
              const dnnl::primitive_attr &attr) override;
    // Transposes src[0] into dst[0] using the axis move cached by init().
    void exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst, const int MB) override;
    impl_desc_type getImplType() const override { return implType; }
private:
    // Product of shape dims in the half-open range [start, end).
    static int64_t calcShapeSize(const Shape& shape, size_t start, size_t end);
    // True when `permutations` moves exactly one axis; fills `from` (source slot)
    // and `to` (destination slot) of the moved axis.
    static bool IsTransposeMovingSingleAxis(InferenceEngine::SizeVector permutations, size_t& from, size_t& to);
    void TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to);
    void TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to);
    static const impl_desc_type implType = impl_desc_type::mlas;
    // Axis move computed by init(): the axis at slot `from` relocates to slot `to`.
    size_t from;
    size_t to;
};
// Builder registered in the transpose executor list: reports whether the MLAS
// executor can serve a given configuration and constructs instances of it.
class MlasTransposeExecutorBuilder : public TransposeExecutorBuilder {
public:
    // Checks layout (plain ncsp only) and element size (1/2/4/8 bytes).
    bool isSupported(const TransposeParams& transposeParams,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override;
    // Creates a new MlasTransposeExecutor bound to `context`.
    TransposeExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override;
};
} // namespace intel_cpu
} // namespace ov

View File

@ -9,6 +9,7 @@ namespace intel_cpu {
const std::vector<TransposeExecutorDesc>& getTransposeExecutorsList() {
static const std::vector<TransposeExecutorDesc> descs = {
OV_CPU_INSTANCE_MLAS_ARM64(ExecutorType::Mlas, std::make_shared<MlasTransposeExecutorBuilder>())
OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared<RefOptimizedTransposeExecutorBuilder>())
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<ACLTransposeExecutorBuilder>())
OV_CPU_INSTANCE_X64(ExecutorType::x64, std::make_shared<JitTransposeExecutorBuilder>())

View File

@ -13,6 +13,7 @@
#include "common/ref_opt_transpose.hpp"
#include "common/ref_transpose.hpp"
#include "mlas/mlas_transpose.hpp"
#include "x64/jit_transpose.hpp"
#include "onednn/iml_type_mapper.h"

@ -1 +1 @@
Subproject commit 519abf79de5ee295cfe5bbed97037a2623616c80
Subproject commit 1d68240b5114326604c3f5af47ac1c098e30b254