[CPU][ARM] MLAS Transpose executor (#18879)
This commit is contained in:
parent
62fa09a181
commit
7b4a7e5eb4
@ -33,8 +33,17 @@ else()
|
||||
endif()
|
||||
set(OV_CPU_ARM_TARGET_ARCH ${OV_CPU_ARM_TARGET_ARCH_DEFAULT})
|
||||
|
||||
# enable mlas for X86 cpus only
|
||||
ie_dependent_option(ENABLE_MLAS_FOR_CPU "MLAS GEMM for OpenVINO CPU Plugin" ON "X86 OR X86_64" OFF)
|
||||
if(X86 OR X86_64 OR AARCH64)
|
||||
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)
|
||||
set(ENABLE_MLAS_FOR_CPU_DEFAULT OFF)
|
||||
else()
|
||||
set(ENABLE_MLAS_FOR_CPU_DEFAULT ON)
|
||||
endif()
|
||||
else()
|
||||
set(ENABLE_MLAS_FOR_CPU_DEFAULT OFF)
|
||||
endif()
|
||||
ie_option(ENABLE_MLAS_FOR_CPU "Enable MLAS for OpenVINO CPU Plugin" ${ENABLE_MLAS_FOR_CPU_DEFAULT})
|
||||
|
||||
add_subdirectory(thirdparty)
|
||||
|
||||
if(WIN32)
|
||||
@ -80,6 +89,7 @@ if(NOT (AARCH64 OR ARM))
|
||||
endif()
|
||||
|
||||
if (NOT ENABLE_MLAS_FOR_CPU)
|
||||
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/mlas/*)
|
||||
list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/mlas/*)
|
||||
endif()
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "../transpose.hpp"
|
||||
#include "nodes/executors/transpose.hpp"
|
||||
#include "utils/debug_capabilities.h"
|
||||
|
||||
namespace ov {
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "../transpose.hpp"
|
||||
#include "nodes/executors/transpose.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
@ -11,6 +11,13 @@
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
#if defined(OV_CPU_WITH_MLAS) && defined(OPENVINO_ARCH_ARM64)
|
||||
#define OV_CPU_INSTANCE_MLAS_ARM64(...) \
|
||||
{__VA_ARGS__},
|
||||
#else
|
||||
#define OV_CPU_INSTANCE_MLAS_ARM64(...)
|
||||
#endif
|
||||
|
||||
#if defined(OV_CPU_WITH_ACL)
|
||||
#define OV_CPU_INSTANCE_ACL(...) \
|
||||
{__VA_ARGS__},
|
||||
@ -40,7 +47,8 @@ enum class ExecutorType {
|
||||
Common,
|
||||
x64,
|
||||
Dnnl,
|
||||
Acl
|
||||
Acl,
|
||||
Mlas
|
||||
};
|
||||
|
||||
class ExecutorContext {
|
||||
|
@ -0,0 +1,314 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "mlas_transpose.hpp"
|
||||
#include "ie_parallel.hpp"
|
||||
#include "nodes/common/cpu_memcpy.h"
|
||||
#include "mlas.h"
|
||||
|
||||
using namespace InferenceEngine;
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
template <typename T>
|
||||
struct has_mlas_transpose : std::false_type {};
|
||||
|
||||
template <>
|
||||
struct has_mlas_transpose<uint8_t> : std::true_type {};
|
||||
|
||||
template <>
|
||||
struct has_mlas_transpose<uint16_t> : std::true_type {};
|
||||
|
||||
template <>
|
||||
struct has_mlas_transpose<uint32_t> : std::true_type {};
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
|
||||
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) {
|
||||
const T* end;
|
||||
for (int64_t l = 0; l < num_loops; ++l) {
|
||||
T* output_for_first_writer = output_data;
|
||||
for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
|
||||
T* output_for_current_writer = output_for_first_writer;
|
||||
end = input_data + num_writers;
|
||||
for (; input_data != end;) {
|
||||
*output_for_current_writer = *input_data++;
|
||||
// skip to output position for next writer
|
||||
output_for_current_writer += writes_per_writer_per_loop;
|
||||
}
|
||||
++output_for_first_writer;
|
||||
}
|
||||
output_data += writes_per_loop;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
|
||||
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop, int64_t writes_per_writer_per_loop) {
|
||||
for (int64_t l = 0; l < num_loops; ++l) {
|
||||
MlasTranspose(input_data, output_data, static_cast<size_t>(writes_per_writer_per_loop), static_cast<size_t>(num_writers));
|
||||
input_data += writes_per_loop;
|
||||
output_data += writes_per_loop;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
|
||||
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) {
|
||||
T* end;
|
||||
for (int64_t l = 0; l < num_loops; ++l) {
|
||||
const T* input_for_first_reader = input_data;
|
||||
for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
|
||||
const T* input_for_current_reader = input_for_first_reader;
|
||||
end = output_data + num_readers;
|
||||
for (; output_data != end;) {
|
||||
*output_data++ = *input_for_current_reader;
|
||||
// skip to input position for next reader
|
||||
input_for_current_reader += reads_per_reader_per_loop;
|
||||
}
|
||||
++input_for_first_reader;
|
||||
}
|
||||
input_data += reads_per_loop;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
|
||||
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop, int64_t reads_per_reader_per_loop) {
|
||||
for (int64_t l = 0; l < num_loops; ++l) {
|
||||
MlasTranspose(input_data, output_data, static_cast<size_t>(num_readers), static_cast<size_t>(reads_per_reader_per_loop));
|
||||
input_data += reads_per_loop;
|
||||
output_data += reads_per_loop;
|
||||
}
|
||||
}
|
||||
|
||||
int64_t MlasTransposeExecutor::calcShapeSize(const Shape& shape, size_t start, size_t end) {
|
||||
int64_t size = 1;
|
||||
for (size_t i = start; i < end; i++) {
|
||||
size *= shape.getDims()[i];
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
// Returns true when `permutations` moves exactly one axis while all other axes
// keep their relative order. On success, `from` receives the axis's original
// position and `to` its destination position; on failure, the outputs may be
// partially written and must not be used.
bool MlasTransposeExecutor::IsTransposeMovingSingleAxis(SizeVector permutations, size_t& from, size_t& to) {
    // if a single axis moved to an outer dimension, the values should be one lower than the index until the slot the
    // axis was moved from, and equal to the index after that.
    // e.g. axis 3 moves out to 1 would be: 0, 3, 1, 2, 4
    auto check_moved_outwards = [&permutations](size_t cur, size_t moved_from) {
        // we start processing with the slot after the moved one, so the expected value is one less than the index
        size_t expected = cur - 1;
        for (size_t end = permutations.size(); cur < end; ++cur) {
            if (permutations[cur] != expected) {
                return false;
            }
            // we are at the slot the axis moved from, so do an additional increment before checking the next value
            if (cur == moved_from) {
                ++expected;
            }
            ++expected;
        }
        return true;
    };
    // if a single axis moved to an inner dimension, the values should be one higher than the index until the slot the
    // axis was moved to, and equal to the index after that.
    // e.g. axis 1 moves inwards to 3 would be: 0, 2, 3, 1, 4
    auto check_moved_inwards = [&permutations](size_t cur, size_t& moved_to) {
        size_t started_at = cur;
        size_t expected = cur + 1;
        // sentinel: "destination not found yet"
        moved_to = std::numeric_limits<size_t>::max();
        for (size_t end = permutations.size(); cur < end; ++cur) {
            if (permutations[cur] != expected) {
                // if a single axis moved it must have come from the location we started at
                if (started_at != permutations[cur]) {
                    return false;
                }
                moved_to = cur;
            } else {
                ++expected;
            }
        }
        // sentinel still set means the moved axis never landed anywhere
        return moved_to != std::numeric_limits<size_t>::max();
    };
    bool single_axis_moved = false;
    // check axis moving outwards (earlier entry in permutations)
    for (size_t i = 0, end = permutations.size(); i < end; ++i) {
        size_t axis = permutations[i];
        if (axis != i) {
            // first slot where the permutation deviates from identity: a single-axis
            // move (in either direction) must start here, so only this slot is probed
            if (check_moved_outwards(i + 1, axis)) {
                single_axis_moved = true;
                to = i;
                from = axis;
            } else if (check_moved_inwards(i, to)) {
                single_axis_moved = true;
                from = i;
            }
            break;
        }
    }
    return single_axis_moved;
}
|
||||
|
||||
// Moves axis `from` outwards to the earlier position `to` (from > to).
// The N-D transpose is reduced to repeated 2D scatters: contiguous input blocks
// of `block_size` elements are written to strided output positions. The element
// byte width selects a typed kernel; other widths fall back to per-block memcpy.
void MlasTransposeExecutor::TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) {
    const auto& input_shape = input->getShape();
    const auto& input_dims = input_shape.getDims();
    // element size in bytes; drives the switch below
    const auto element_size = input->getDesc().getPrecision().size();

    const auto* input_data = reinterpret_cast<const uint8_t*>(input->getData());
    auto* output_data = reinterpret_cast<uint8_t*>(output->getData());

    // dims before `to` are untouched by the move -> iterated as outer loops
    auto num_loops = calcShapeSize(input_shape, 0, to);
    // length of the moved axis = number of interleaved writers per loop
    auto num_writers = input_dims[from];
    // dims after `from` stay contiguous and are copied as one block
    auto block_size = calcShapeSize(input_shape, from + 1, input_shape.getRank());
    auto writes_per_loop = int64_t(input_shape.getElementsCount() / num_loops / block_size);
    auto writes_per_writer_per_loop = int64_t(writes_per_loop / num_writers);
    // TODO: check integer overflow
    const size_t bytes_per_write = static_cast<size_t>(block_size) * element_size;

    // Reinterpret the data as the unsigned integer type matching the block size
    // so the typed kernels (and MLAS, for 1/2/4-byte elements) can be used.
    switch (bytes_per_write) {
        case (sizeof(uint8_t)): {
            SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop,
                                              writes_per_writer_per_loop);
            break;
        }
        case (sizeof(uint16_t)): {
            SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint16_t*>(input_data),
                                              reinterpret_cast<uint16_t*>(output_data), num_loops, num_writers,
                                              writes_per_loop, writes_per_writer_per_loop);
            break;
        }
        case (sizeof(uint32_t)): {
            SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint32_t*>(input_data),
                                              reinterpret_cast<uint32_t*>(output_data), num_loops, num_writers,
                                              writes_per_loop, writes_per_writer_per_loop);
            break;
        }
        case (sizeof(uint64_t)): {
            SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint64_t*>(input_data),
                                              reinterpret_cast<uint64_t*>(output_data), num_loops, num_writers,
                                              writes_per_loop, writes_per_writer_per_loop);
            break;
        }
        default: {
            // we need to use memcpy for each block
            for (int64_t l = 0; l < num_loops; ++l) {
                uint8_t* output_for_first_writer = output_data;

                for (auto wwpl = 0; wwpl < writes_per_writer_per_loop; ++wwpl) {
                    uint8_t* output_for_current_writer = output_for_first_writer;

                    for (uint64_t w = 0; w < num_writers; ++w) {
                        memcpy(output_for_current_writer, input_data, bytes_per_write);
                        // skip to output position for next writer
                        output_for_current_writer += (writes_per_writer_per_loop * bytes_per_write);
                        input_data += bytes_per_write;
                    }
                    output_for_first_writer += bytes_per_write;
                }
                output_data += writes_per_loop * bytes_per_write;
            }
        }
    }
}
|
||||
|
||||
// Moves axis `from` inwards to the later position `to` (from < to).
// Mirror of TransposeSingleAxisOutwards: the contiguous output stream gathers
// strided `block_size`-element blocks from the input. The element byte width
// selects a typed kernel; other widths fall back to per-block memcpy.
void MlasTransposeExecutor::TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to) {
    const auto& input_shape = input->getShape();
    const auto& input_dims = input_shape.getDims();

    // element size in bytes; drives the switch below
    const auto element_size = input->getDesc().getPrecision().size();
    const auto* input_data = reinterpret_cast<const uint8_t*>(input->getData());
    auto* output_data = reinterpret_cast<uint8_t*>(output->getData());

    // dims before `from` are untouched by the move -> iterated as outer loops
    auto num_loops = calcShapeSize(input_shape, 0, from);
    // length of the moved axis = number of interleaved readers per loop
    auto num_readers = input_dims[from];
    // dims after `to` stay contiguous and are copied as one block
    auto block_size = calcShapeSize(input_shape, to + 1, input_shape.getRank());
    auto reads_per_loop = int64_t(input_shape.getElementsCount() / num_loops / block_size);
    auto reads_per_reader_per_loop = int64_t(reads_per_loop / num_readers);
    // TODO: check integer overflow
    const size_t bytes_per_read = static_cast<size_t>(block_size) * element_size;

    // Reinterpret the data as the unsigned integer type matching the block size
    // so the typed kernels (and MLAS, for 1/2/4-byte elements) can be used.
    switch (bytes_per_read) {
        case (sizeof(uint8_t)): {
            SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop,
                                             reads_per_reader_per_loop);
            break;
        }
        case (sizeof(uint16_t)): {
            SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint16_t*>(input_data),
                                             reinterpret_cast<uint16_t*>(output_data), num_loops, num_readers, reads_per_loop,
                                             reads_per_reader_per_loop);
            break;
        }
        case (sizeof(uint32_t)): {
            SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint32_t*>(input_data),
                                             reinterpret_cast<uint32_t*>(output_data), num_loops, num_readers, reads_per_loop,
                                             reads_per_reader_per_loop);
            break;
        }
        case (sizeof(uint64_t)): {
            SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint64_t*>(input_data),
                                             reinterpret_cast<uint64_t*>(output_data), num_loops, num_readers, reads_per_loop,
                                             reads_per_reader_per_loop);
            break;
        }
        default: {
            // we need to use memcpy for each block
            for (int64_t l = 0; l < num_loops; ++l) {
                const uint8_t* input_for_first_reader = input_data;
                for (auto rrpl = 0; rrpl < reads_per_reader_per_loop; ++rrpl) {
                    const uint8_t* input_for_current_reader = input_for_first_reader;
                    for (uint64_t r = 0; r < num_readers; ++r) {
                        memcpy(output_data, input_for_current_reader, bytes_per_read);
                        output_data += bytes_per_read;
                        // skip to input position for next reader
                        input_for_current_reader += (reads_per_reader_per_loop * bytes_per_read);
                    }
                    input_for_first_reader += bytes_per_read;
                }
                input_data += reads_per_loop * bytes_per_read;
            }
        }
    }
}
|
||||
|
||||
void MlasTransposeExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const int MB) {
|
||||
if (from > to) {
|
||||
TransposeSingleAxisOutwards(src[0], dst[0], from, to);
|
||||
} else {
|
||||
TransposeSingleAxisInwards(src[0], dst[0], from, to);
|
||||
}
|
||||
}
|
||||
|
||||
bool MlasTransposeExecutor::init(const TransposeParams &transposeParams,
|
||||
const std::vector<MemoryDescPtr> &srcDescs,
|
||||
const std::vector<MemoryDescPtr> &dstDescs,
|
||||
const dnnl::primitive_attr &attr) {
|
||||
if (!IsTransposeMovingSingleAxis(transposeParams.permuteParams.order, from, to)) {
|
||||
DEBUG_LOG("MLAS Transpose executor supports moving single axis only");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MlasTransposeExecutorBuilder::isSupported(const TransposeParams& transposeParams,
|
||||
const std::vector<MemoryDescPtr>& srcDescs,
|
||||
const std::vector<MemoryDescPtr>& dstDescs) const {
|
||||
if (!srcDescs[0]->hasLayoutType(LayoutType::ncsp) ||
|
||||
!dstDescs[0]->hasLayoutType(LayoutType::ncsp)) {
|
||||
DEBUG_LOG("MLAS Transpose executor supports NCHW layout only");
|
||||
return false;
|
||||
}
|
||||
if (!one_of(srcDescs[0]->getPrecision().size(), 1u, 2u, 4u, 8u)) {
|
||||
DEBUG_LOG("MLAS Transpose executor supports 1, 2, 4, 8 byte precision sizes");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Factory: creates a fresh MlasTransposeExecutor bound to the given context.
TransposeExecutorPtr MlasTransposeExecutorBuilder::makeExecutor(const ExecutorContext::CPtr context) const {
    return std::make_shared<MlasTransposeExecutor>(context);
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -0,0 +1,42 @@
|
||||
// Copyright (C) 2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "nodes/executors/transpose.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
// Transpose executor backed by MLAS. Supports only permutations that move a
// single axis; those are lowered to 2D transposes (MlasTranspose for 1/2/4-byte
// elements, typed copy or memcpy fallbacks otherwise).
class MlasTransposeExecutor : public TransposeExecutor {
public:
    using TransposeExecutor::TransposeExecutor;
    // Rejects permutations that move more than one axis; on success records the
    // moved axis's source/destination positions for exec().
    bool init(const TransposeParams &transposeParams,
              const std::vector<MemoryDescPtr> &srcDescs,
              const std::vector<MemoryDescPtr> &dstDescs,
              const dnnl::primitive_attr &attr) override;
    void exec(const std::vector<MemoryCPtr> &src, const std::vector<MemoryPtr> &dst, const int MB) override;

    impl_desc_type getImplType() const override { return implType; }
private:
    // Product of the shape's dims over [start, end).
    static int64_t calcShapeSize(const Shape& shape, size_t start, size_t end);
    // True when `permutations` moves exactly one axis; outputs its positions.
    static bool IsTransposeMovingSingleAxis(InferenceEngine::SizeVector permutations, size_t& from, size_t& to);
    // Kernel for an axis moving to an earlier (outer) position.
    void TransposeSingleAxisOutwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to);
    // Kernel for an axis moving to a later (inner) position.
    void TransposeSingleAxisInwards(const MemoryCPtr& input, const MemoryPtr& output, size_t from, size_t to);

    static const impl_desc_type implType = impl_desc_type::mlas;
    // Source and destination positions of the moved axis, set by init().
    size_t from;
    size_t to;
};
|
||||
|
||||
// Builder registered in the transpose executor list: reports whether the MLAS
// path can handle a given configuration and constructs the executor.
class MlasTransposeExecutorBuilder : public TransposeExecutorBuilder {
public:
    // Requires plain (ncsp) src/dst layouts and 1/2/4/8-byte element sizes.
    bool isSupported(const TransposeParams& transposeParams,
                     const std::vector<MemoryDescPtr>& srcDescs,
                     const std::vector<MemoryDescPtr>& dstDescs) const override;

    TransposeExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override;
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -9,6 +9,7 @@ namespace intel_cpu {
|
||||
|
||||
const std::vector<TransposeExecutorDesc>& getTransposeExecutorsList() {
|
||||
static const std::vector<TransposeExecutorDesc> descs = {
|
||||
OV_CPU_INSTANCE_MLAS_ARM64(ExecutorType::Mlas, std::make_shared<MlasTransposeExecutorBuilder>())
|
||||
OV_CPU_INSTANCE_COMMON(ExecutorType::Common, std::make_shared<RefOptimizedTransposeExecutorBuilder>())
|
||||
OV_CPU_INSTANCE_ACL(ExecutorType::Acl, std::make_shared<ACLTransposeExecutorBuilder>())
|
||||
OV_CPU_INSTANCE_X64(ExecutorType::x64, std::make_shared<JitTransposeExecutorBuilder>())
|
||||
|
@ -13,6 +13,7 @@
|
||||
|
||||
#include "common/ref_opt_transpose.hpp"
|
||||
#include "common/ref_transpose.hpp"
|
||||
#include "mlas/mlas_transpose.hpp"
|
||||
#include "x64/jit_transpose.hpp"
|
||||
|
||||
#include "onednn/iml_type_mapper.h"
|
||||
|
2
src/plugins/intel_cpu/thirdparty/mlas
vendored
2
src/plugins/intel_cpu/thirdparty/mlas
vendored
@ -1 +1 @@
|
||||
Subproject commit 519abf79de5ee295cfe5bbed97037a2623616c80
|
||||
Subproject commit 1d68240b5114326604c3f5af47ac1c098e30b254
|
Loading…
Reference in New Issue
Block a user