[CPU] Plugin migration on oneDNN (v1.6) (#3725)

Gorokhov Dmitriy 2021-01-26 16:31:10 +03:00 committed by GitHub
parent 0284cd69a8
commit d58b4c65c8
140 changed files with 8355 additions and 5511 deletions

View File

@ -120,7 +120,7 @@ target_include_directories(${TARGET_NAME}_obj PRIVATE "${CMAKE_CURRENT_SOURCE_DI
target_link_libraries(${TARGET_NAME}_obj PRIVATE ${TARGET_NAME}_reader_api)
if(ENABLE_MKL_DNN)
target_include_directories(${TARGET_NAME}_obj SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu/xbyak")
target_include_directories(${TARGET_NAME}_obj SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu/x64/xbyak")
endif()
set_ie_threading_interface_for(${TARGET_NAME}_obj)

View File

@ -1539,6 +1539,9 @@ void ConvertPrecision(CNNNetwork& net, Precision from, Precision to) {
case getPrecisionMask(Precision::U16, Precision::I32):
convertPrecisionForAll<Precision::U16, Precision::I32>(net);
break;
case getPrecisionMask(Precision::I16, Precision::I32):
convertPrecisionForAll<Precision::I16, Precision::I32>(net);
break;
default:
THROW_IE_EXCEPTION << "Precision conversion from " << from << " to " << to
<< " currently is not supported. You may expand precision"

View File

@ -12,6 +12,7 @@ if (WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX")
endif()
## TODO
set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_batchnorm_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_bin_conv_node.cpp
@ -34,8 +35,8 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reorder_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reshape_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_rnn.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_roi_pooling_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_roi_align_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_roi_pooling_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_softmax_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_split_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tensoriterator_node.cpp
@ -144,27 +145,13 @@ include_directories(
$<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>
${CMAKE_CURRENT_SOURCE_DIR}/mkldnn
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_BINARY_DIR}/include)
include_directories(SYSTEM
${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/common
${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu
${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include)
${CMAKE_BINARY_DIR}/include
)
if (GEMM STREQUAL "MKL")
log_rpath_from_dir(MKL "${MKL}/lib")
endif()
if (THREADING STREQUAL "TBB")
set(MKLDNN_THR MKLDNN_THR_TBB)
elseif (THREADING STREQUAL "TBB_AUTO")
set(MKLDNN_THR MKLDNN_THR_TBB_AUTO)
elseif (THREADING STREQUAL "OMP")
set(MKLDNN_THR MKLDNN_THR_OMP)
else()
set(MKLDNN_THR MKLDNN_THR_SEQ)
endif()
# create plugin
ie_add_plugin(NAME ${TARGET_NAME}
@ -174,11 +161,12 @@ ie_add_plugin(NAME ${TARGET_NAME}
set_ie_threading_interface_for(${TARGET_NAME})
target_compile_definitions(${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
target_link_libraries(${TARGET_NAME} PRIVATE mkldnn inference_engine inference_engine_legacy
inference_engine_transformations inference_engine_lp_transformations openvino::conditional_compilation)
target_include_directories(${TARGET_NAME} PRIVATE
$<TARGET_PROPERTY:mkldnn,INCLUDE_DIRECTORIES>)
# Cross compiled function
# TODO: The same for proposal, proposalONNX, topk
cross_compiled_file(${TARGET_NAME}
@ -201,18 +189,22 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# add test object library
add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_legacy,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_transformations,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:openvino::itt,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:openvino::conditional_compilation,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>)
$<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
$<TARGET_PROPERTY:mkldnn,INCLUDE_DIRECTORIES>)
set_ie_threading_interface_for(${TARGET_NAME}_obj)
target_compile_definitions(${TARGET_NAME}_obj PUBLIC -DMKLDNN_THR=${MKLDNN_THR}
PRIVATE USE_STATIC_IE IMPLEMENT_INFERENCE_ENGINE_PLUGIN)
target_compile_definitions(${TARGET_NAME}_obj
PRIVATE USE_STATIC_IE IMPLEMENT_INFERENCE_ENGINE_PLUGIN
)
set_target_properties(${TARGET_NAME}_obj PROPERTIES EXCLUDE_FROM_ALL ON)

View File

@ -56,12 +56,6 @@ void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) {
InputsDataMap inputs = network.getInputsInfo();
OutputsDataMap outputs = network.getOutputsInfo();
for (auto iter : sortedLayers) {
if (CaselessEq<std::string>()(iter->type, "convolution")) {
auto dims = iter->insData[0].lock()->getDims();
if ((dims.size() == 4 || dims.size() == 5) && (dims[1] == 1 || dims[1] == 3))
continue;
}
// check if the memory output node needs to be transformed
if (iter->type == "Memory" && iter->outData.size() == 0 &&
iter->insData[0].lock()->getPrecision() == Precision::FP32) {

View File

@ -10,12 +10,11 @@
#include "ie_plugin_config.hpp"
#include "ie_common.h"
#include "ie_parallel.hpp"
#include "ie_system_conf.h"
#include <cpp_interfaces/exception2status.hpp>
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
#include <ie_parallel.hpp>
#include <ie_system_conf.h>
namespace MKLDNNPlugin {

View File

@ -1,68 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "desc_layer.h"
#include "desc_tensor.h"
#include "desc_tensor_comb.h"
#include "cpu_prim_layer.h"
#include "cpu_prim_tensor.h"
#include "mkldnn.hpp"
#include <memory>
#include <vector>
using namespace InferenceEngine;
namespace MKLDNNPlugin {
class CpuEngine;
using CpuEnginePtr = std::shared_ptr<CpuEngine>;
class CpuEngine : public details::no_copy {
public:
CpuEngine() : eng(mkldnn::engine(mkldnn::engine::kind::cpu, 0)) {}
void bindThreads();
void createDescription(DescTensorPtr tns, bool isWeights = false);
void createDescription(DescLayerPtr layer);
void setFlatFormat(DescTensorPtr tns);
void createPrimitive(DescTensorPtr tns);
void createPrimitive(DescLayerPtr tns);
void setData(const TBlob<float> &src, DescTensorPtr dst);
void getData(const DescTensorPtr src, TBlob<float> &dst);
void subtraction(DescTensorPtr dst, DescTensorPtr sub);
void subtraction(DescTensorPtr dst, std::vector<float> sub);
void score(std::vector<DescLayerPtr> layers);
void score(DescLayerPtr layer);
void process(std::vector<mkldnn::primitive> exec_queue);
mkldnn::engine eng; // TODO: Move me back to private section
private:
static inline mkldnn::memory::desc *get_desc(std::vector<DescTensorPtr> tensors, size_t indx = 0);
static inline mkldnn::memory::desc *get_desc(DescTensorPtr tns);
static inline mkldnn::memory *get_prim(std::vector<DescTensorPtr> tns, size_t indx = 0);
static inline mkldnn::memory *get_prim(DescTensorPtr tns);
void createPrimitiveCombined(DescTensorComb &tns, void *data);
};
} // namespace MKLDNNPlugin

View File

@ -1,50 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "prim_layer.h"
#include "mkldnn.hpp"
#include <memory>
using namespace InferenceEngine;
using namespace mkldnn;
namespace MKLDNNPlugin {
class CpuPrimLayer : public PrimLayer {
friend class CpuEngine;
mkldnn::engine eng;
std::shared_ptr<mkldnn::primitive> prim;
public:
explicit CpuPrimLayer(engine eng) : eng(eng) {}
};
template<typename LYR>
class Layer : public CpuPrimLayer {
typename LYR::desc desc;
typename LYR::primitive_desc prim_desc;
public:
Layer(typename LYR::desc desc, engine eng) :
CpuPrimLayer(eng),
desc(desc),
prim_desc(desc, eng) {}
friend class CpuEngine;
};
class ReorderLayer : public CpuPrimLayer {
reorder::primitive_desc prim_desc;
public:
ReorderLayer(reorder::primitive_desc desc, engine eng) :
CpuPrimLayer(eng),
prim_desc(desc) {}
friend class CpuEngine;
};
} // namespace MKLDNNPlugin

View File

@ -1,34 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "prim_tensor.h"
#include "mkldnn.hpp"
#include <memory>
namespace MKLDNNPlugin {
class CpuPrimTensor;
using CpuPrimTensorPtr = std::shared_ptr<CpuPrimTensor>;
class CpuPrimTensor : public PrimTensor {
public:
using Memory = std::shared_ptr<mkldnn::memory>;
using PrimitiveDesc = std::shared_ptr<mkldnn::memory::primitive_desc>;
explicit CpuPrimTensor(mkldnn::memory::desc desc) :
desc(desc) {}
mkldnn::memory getPrimitive() { return *(memory.get()); }
private:
Memory memory;
mkldnn::memory::desc desc;
friend class CpuEngine;
};
} // namespace MKLDNNPlugin

View File

@ -1,166 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "mkldnn.hpp"
#include <string>
#include <mkldnn_types.h>
#include <mkldnn.h>
namespace mkldnn {
struct primitive_desc_iterator : public handle<mkldnn_primitive_desc_iterator_t> {
template <typename T>
primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr, const engine &aengine) {
mkldnn_primitive_desc_iterator_t result;
auto sts = mkldnn_primitive_desc_iterator_create_v2(
&result, &adesc.data, aattr.get(), aengine.get(), nullptr);
if (sts == mkldnn_status_t::mkldnn_success)
reset(result);
else if (sts == mkldnn_status_t::mkldnn_unimplemented)
reset(nullptr);
else
THROW_IE_EXCEPTION << "could not create a primitive descriptor iterator";
}
template <typename T, typename TF>
primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr,
const engine &aengine, const TF &hint_fwd_primitive_desc) {
mkldnn_primitive_desc_iterator_t result;
auto sts = mkldnn_primitive_desc_iterator_create_v2(&result,
&adesc.data,
aattr.get(),
aengine.get(),
hint_fwd_primitive_desc.get());
if (sts == mkldnn_status_t::mkldnn_success)
reset(result);
else if (sts == mkldnn_status_t::mkldnn_unimplemented)
reset(nullptr);
else
THROW_IE_EXCEPTION << "could not create a primitive descriptor iterator";
}
bool is_not_end() const {
return (handle::get() != nullptr);
}
memory::primitive_desc fetch() const {
memory::primitive_desc adesc;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc = mkldnn_primitive_desc_iterator_fetch(get());
adesc.reset(cdesc);
return adesc;
}
primitive_desc_iterator operator++(int) {
mkldnn_status_t status = mkldnn_primitive_desc_iterator_next(get());
if (status == mkldnn_status_t::mkldnn_iterator_ends)
reset(nullptr);
else if (status != mkldnn_status_t::mkldnn_success)
THROW_IE_EXCEPTION << "could not get next iteration";
return *this;
}
memory::primitive_desc src_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(src_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a src primititve descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc dst_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(dst_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a dst primitive descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc diff_src_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(diff_src_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a diff_src primititve descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc weights_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(weights_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a weights primitive descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc diff_dst_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(diff_dst_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a diff_dst primitive descriptor");
adesc.reset(cdesc);
return adesc;
}
std::string get_impl_info_str() const {
memory::primitive_desc cdesc_elem;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const char *info;
error::wrap_c_api(mkldnn_primitive_desc_query(cdesc_elem.get(),
mkldnn::convert_to_c(impl_info_str), 0, &info),
"could not query info string of primitive descriptor");
return std::string(info);
}
template <typename T>
void getPrimitiveDescriptor(T& pdesc) const {
mkldnn_primitive_desc_t cdesc = nullptr;
memory::primitive_desc cdescpd;
cdescpd.reset(mkldnn_primitive_desc_iterator_fetch(get()));
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, cdescpd.get()),
"could not clone a src primititve descriptor");
pdesc.reset(cdesc);
}
};
} // namespace mkldnn

View File

@ -0,0 +1,152 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ie_mkldnn.h"
#include <dnnl_debug.h>
#include <cpu/platform.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include <cassert>
#include <cstring>
namespace mkldnn {
namespace utils {
const char* fmt2str(memory::format_tag fmt) {
return dnnl_fmt_tag2str(static_cast<dnnl_format_tag_t>(fmt));
}
mkldnn::memory::format_tag str2fmt(const char *str) {
#define CASE(_fmt) do { \
if (!strcmp(#_fmt, str) \
|| !strcmp("mkldnn_" #_fmt, str)) \
return static_cast<dnnl::memory::format_tag>(dnnl_ ## _fmt); \
} while (0)
CASE(x);
CASE(nc);
CASE(ncw);
CASE(nwc);
CASE(nCw4c);
CASE(nCw8c);
CASE(nCw16c);
CASE(nchw);
CASE(nhwc);
CASE(chwn);
CASE(nChw4c);
CASE(nChw8c);
CASE(nChw16c);
CASE(oi);
CASE(io);
CASE(oiw);
CASE(wio);
CASE(OIw16i16o);
CASE(OIw16o16i);
CASE(Oiw16o);
CASE(Owi16o);
CASE(OIw8i16o2i);
CASE(OIw4i16o4i);
CASE(oihw);
CASE(ihwo);
CASE(hwio);
CASE(iohw);
CASE(dhwio);
CASE(OIhw8i8o);
CASE(OIhw16i16o);
CASE(OIhw8i16o2i);
CASE(OIdhw8i16o2i);
CASE(OIhw4i16o4i);
CASE(OIdhw4i16o4i);
CASE(OIhw8o16i2o);
CASE(IOhw8o16i2o);
CASE(OIhw8o8i);
CASE(OIhw8o32i);
CASE(OIhw16o32i);
CASE(OIhw16o16i);
CASE(IOhw16o16i);
CASE(Oihw16o);
CASE(Ohwi8o);
CASE(Ohwi16o);
CASE(goiw);
CASE(goihw);
CASE(hwigo);
CASE(giohw);
CASE(dhwigo);
CASE(goiw);
CASE(gOIw16i16o);
CASE(gOIw16o16i);
CASE(gOiw16o);
CASE(gOwi16o);
CASE(gOIw8i16o2i);
CASE(gOIw4i16o4i);
CASE(Goiw16g);
CASE(gOIhw8i8o);
CASE(gOIhw16i16o);
CASE(gOIhw8i16o2i);
CASE(gOIdhw8i16o2i);
CASE(gOIhw2i8o4i);
CASE(gOIhw4i16o4i);
CASE(gOIdhw4i16o4i);
CASE(gOIhw8o16i2o);
CASE(gIOhw8o16i2o);
CASE(gOIhw4o4i);
CASE(gOIhw8o8i);
CASE(gOIhw16o16i);
CASE(gIOhw16o16i);
CASE(gOihw16o);
CASE(gOhwi8o);
CASE(gOhwi16o);
CASE(Goihw8g);
CASE(Goihw16g);
CASE(Goidhw4g);
CASE(Goidhw8g);
CASE(Goidhw16g);
CASE(ncdhw);
CASE(ndhwc);
CASE(oidhw);
CASE(goidhw);
CASE(nCdhw4c);
CASE(nCdhw8c);
CASE(nCdhw16c);
CASE(OIdhw16i16o);
CASE(gOIdhw16i16o);
CASE(OIdhw16o16i);
CASE(gOIdhw16o16i);
CASE(Oidhw16o);
CASE(Odhwi16o);
CASE(gOidhw16o);
CASE(gOdhwi16o);
CASE(ntc);
CASE(tnc);
CASE(ldigo);
CASE(ldgoi);
CASE(ldgo);
#undef CASE
assert(!"unknown memory format");
return dnnl::memory::format_tag::undef;
}
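Editor's sketch (not from this patch): a round-trip check of the two helpers above; dnnl's debug strings match the tag names, so the mapping is symmetric for every tag in the CASE list.

void check_fmt_roundtrip() {
    mkldnn::memory::format_tag tag = str2fmt("nChw16c");
    assert(tag == mkldnn::memory::format_tag::nChw16c);
    assert(!strcmp(fmt2str(tag), "nChw16c"));   // <cassert>/<cstring> included above
}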
int get_cache_size(int level, bool per_core) {
if (per_core) {
return mkldnn::impl::cpu::platform::get_per_core_cache_size(level);
} else {
using namespace mkldnn::impl::cpu::x64;
if (cpu().getDataCacheLevels() == 0) {
// this function can return stub values in case of unknown CPU type
return mkldnn::impl::cpu::platform::get_per_core_cache_size(level);
}
if (level > 0 && (unsigned) level <= cpu().getDataCacheLevels()) {
unsigned l = level - 1;
return cpu().getDataCacheSize(l);
} else {
return 0;
}
}
DNNL_THROW_ERROR(dnnl_unimplemented, "get_cache_size has no mode per_core == false");
}
} // namespace utils
} // namespace mkldnn

View File

@ -0,0 +1,21 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "mkldnn.hpp"
namespace mkldnn {
using primitive_desc_iterator = mkldnn::primitive_desc;
namespace utils {
int get_cache_size(int level, bool per_core);
const char* fmt2str(memory::format_tag fmt);
mkldnn::memory::format_tag str2fmt(const char *str);
} // namespace utils
} // namespace mkldnn
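Editor's note (assumption, not from this patch): the alias above works because oneDNN v1.x dropped the standalone iterator type; primitive_desc itself iterates over implementations. A hedged sketch assuming the v1.x next_impl()/impl_info_str() API (needs <cstdio>):

inline void list_impls(mkldnn::primitive_desc &pd) {
    do {
        printf("%s\n", pd.impl_info_str().c_str());   // e.g. "jit:avx2"
    } while (pd.next_impl());                         // returns false when exhausted
}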

View File

@ -17,31 +17,36 @@ impl_desc_type MKLDNNPlugin::parse_impl_name(std::string impl_desc_name) {
#define SEARCH_WORD(_wrd) if (impl_desc_name.find(#_wrd) != std::string::npos) \
res = static_cast<impl_desc_type>(res | impl_desc_type::_wrd);
#define SEARCH_WORD_2(_wrd, _key) if (impl_desc_name.find(#_wrd) != std::string::npos) \
res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
SEARCH_WORD(ref);
SEARCH_WORD(jit);
SEARCH_WORD(gemm);
SEARCH_WORD(blas);
SEARCH_WORD(sse42);
SEARCH_WORD_2(sse41, sse42);
SEARCH_WORD(avx2);
SEARCH_WORD(avx512);
SEARCH_WORD(any);
SEARCH_WORD(uni);
SEARCH_WORD(_1x1);
SEARCH_WORD(_dw);
SEARCH_WORD(reorder);
if ((res & impl_desc_type::avx2) != impl_desc_type::avx2 &&
(res & impl_desc_type::avx512) != impl_desc_type::avx512)
SEARCH_WORD(avx);
#undef SEARCH_WORD
#define SEARCH_WORD_2(_wrd, _key) if (impl_desc_name.find(#_wrd) != std::string::npos) \
res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
if ((res & impl_desc_type::sse42) != impl_desc_type::sse42 &&
(res & impl_desc_type::avx) != impl_desc_type::avx &&
(res & impl_desc_type::avx2) != impl_desc_type::avx2 &&
(res & impl_desc_type::avx512) != impl_desc_type::avx512)
SEARCH_WORD(uni);
SEARCH_WORD_2(nchw, ref);
SEARCH_WORD_2(ncdhw, ref);
SEARCH_WORD_2(wino, winograd);
#undef SEARCH_WORD_2
#undef SEARCH_WORD
return res;
}
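Editor's sketch (not from this patch, assumes <cassert>): expected behavior of the parser above. impl_desc_type values are bit flags, so one implementation name can set several of them.

void parse_impl_name_example() {
    using namespace MKLDNNPlugin;
    impl_desc_type t = parse_impl_name("jit_avx2_1x1");
    assert(t & impl_desc_type::jit);    // "jit" substring found
    assert(t & impl_desc_type::avx2);   // "avx2" found; bare "avx" deliberately skipped
    assert(t & impl_desc_type::_1x1);   // "_1x1" found
}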

View File

@ -11,20 +11,10 @@ mkldnn::primitive_desc_iterator MKLDNNDescriptor::createPrimitiveDescriptorItera
}
MKLDNNDescriptor::operator bool() {
return desc.get() != nullptr;
return desc != nullptr;
}
size_t MKLDNNDescriptor::inputNumbers() const {
DescFwdImpl<mkldnn::roi_pooling_forward::desc> *roiPooling =
dynamic_cast<DescFwdImpl<mkldnn::roi_pooling_forward::desc> *>(desc.get());
if (roiPooling != nullptr) {
return roiPooling->getPtr()->c_api_inputs.size();
}
DescFwdImpl<mkldnn::deformable_convolution_forward::desc> *defConv =
dynamic_cast<DescFwdImpl<mkldnn::deformable_convolution_forward::desc> *>(desc.get());
if (defConv != nullptr) {
return defConv->getPtr()->c_api_inputs.size();
}
return 1;
}
@ -37,8 +27,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::batch_normalization_f
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::batch_normalization_forward::desc>() {
DescFwdImpl<mkldnn::batch_normalization_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::batch_normalization_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::batch_normalization_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -50,8 +39,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::convolution_forward::
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_forward::desc>() {
DescFwdImpl<mkldnn::convolution_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::convolution_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::convolution_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -66,9 +54,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::convolution_backward_
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_backward_data::desc>() {
DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc> *typeDesc =
dynamic_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc,
mkldnn::convolution_forward::primitive_desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -76,9 +62,7 @@ MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_backward_data::de
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_forward::primitive_desc>() {
DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc> *typeDesc =
dynamic_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc,
mkldnn::convolution_forward::primitive_desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -90,8 +74,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::inner_product_forward
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::inner_product_forward::desc>() {
DescFwdImpl<mkldnn::inner_product_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::inner_product_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::inner_product_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -103,8 +86,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::lrn_forward::desc> de
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::lrn_forward::desc>() {
DescFwdImpl<mkldnn::lrn_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::lrn_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::lrn_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -116,21 +98,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::pooling_forward::desc
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::pooling_forward::desc>() {
DescFwdImpl<mkldnn::pooling_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::pooling_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::roi_pooling_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::roi_pooling_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::roi_pooling_forward::desc>() {
DescFwdImpl<mkldnn::roi_pooling_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::roi_pooling_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::pooling_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -142,21 +110,55 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::softmax_forward::desc
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::softmax_forward::desc>() {
DescFwdImpl<mkldnn::softmax_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::softmax_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::softmax_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::rnn_forward::desc>(desc));
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::vanilla_rnn_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::vanilla_rnn_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::rnn_forward::desc>() {
DescFwdImpl<mkldnn::rnn_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::rnn_forward::desc> *>(desc.get());
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::vanilla_rnn_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::vanilla_rnn_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::lstm_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::lstm_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::lstm_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::lstm_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::gru_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::gru_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::gru_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::gru_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::lbr_gru_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::lbr_gru_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::lbr_gru_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::lbr_gru_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -168,44 +170,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::eltwise_forward::desc
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::eltwise_forward::desc>() {
DescFwdImpl<mkldnn::eltwise_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::eltwise_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::quantization_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::quantization_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::quantization_forward::desc>() {
auto *typeDesc = dynamic_cast<DescFwdImpl<mkldnn::quantization_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::binary_convolution_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::binary_convolution_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::binary_convolution_forward::desc>() {
auto *typeDesc = dynamic_cast<DescFwdImpl<mkldnn::binary_convolution_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::deformable_convolution_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::deformable_convolution_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::deformable_convolution_forward::desc>() {
auto *typeDesc = dynamic_cast<DescFwdImpl<mkldnn::deformable_convolution_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::eltwise_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
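Editor's sketch (hypothetical names, not from this patch): the change repeated through this file, reduced to its core. dynamic_pointer_cast on the shared_ptr replaces dynamic_cast on desc.get(), so the extracted pointer keeps shared ownership.

#include <memory>
#include <stdexcept>

struct IDesc { virtual ~IDesc() = default; };

template <class T>
struct DescImpl : IDesc {
    explicit DescImpl(std::shared_ptr<T> d) : desc(std::move(d)) {}
    std::shared_ptr<T> desc;
};

template <class T>
std::shared_ptr<T> extract(const std::shared_ptr<IDesc> &holder) {
    auto typed = std::dynamic_pointer_cast<DescImpl<T>>(holder);
    if (!typed) throw std::runtime_error("Cannot cast descriptor!");
    return typed->desc;   // shared ownership is preserved
}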

View File

@ -6,8 +6,7 @@
#include <memory>
#include <string>
#include <mkldnn.hpp>
#include <mkldnn/desc_iterator.hpp>
#include "mkldnn/ie_mkldnn.h"
class MKLDNNDescriptor {
public:
@ -31,27 +30,24 @@ public:
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::pooling_forward::desc> desc);
operator std::shared_ptr<mkldnn::pooling_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::roi_pooling_forward::desc> desc);
operator std::shared_ptr<mkldnn::roi_pooling_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::softmax_forward::desc> desc);
operator std::shared_ptr<mkldnn::softmax_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc);
operator std::shared_ptr<mkldnn::rnn_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::vanilla_rnn_forward::desc> desc);
operator std::shared_ptr<mkldnn::vanilla_rnn_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::lstm_forward::desc> desc);
operator std::shared_ptr<mkldnn::lstm_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::gru_forward::desc> desc);
operator std::shared_ptr<mkldnn::gru_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::lbr_gru_forward::desc> desc);
operator std::shared_ptr<mkldnn::lbr_gru_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::eltwise_forward::desc> desc);
operator std::shared_ptr<mkldnn::eltwise_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::quantization_forward::desc> desc);
operator std::shared_ptr<mkldnn::quantization_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::binary_convolution_forward::desc> desc);
operator std::shared_ptr<mkldnn::binary_convolution_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::deformable_convolution_forward::desc> desc);
operator std::shared_ptr<mkldnn::deformable_convolution_forward::desc>();
mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::engine &engine,
const mkldnn::primitive_attr &attr = mkldnn::primitive_attr()) const;
@ -66,6 +62,7 @@ private:
virtual ~IDesc() {}
virtual mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::primitive_attr &attr,
const mkldnn::engine &engine) const = 0;
static constexpr bool allow_empty = true;
};
template <class T>
@ -76,7 +73,7 @@ private:
mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::primitive_attr &attr,
const mkldnn::engine &engine) const override {
return mkldnn::primitive_desc_iterator(*desc, attr, engine);
return mkldnn::primitive_desc_iterator(&desc->data, &attr, engine, nullptr, allow_empty);
}
std::shared_ptr<T>& getPtr() {
@ -95,7 +92,7 @@ private:
mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::primitive_attr &attr,
const mkldnn::engine &engine) const override {
return mkldnn::primitive_desc_iterator(*desc, attr, engine, *prim);
return mkldnn::primitive_desc_iterator(&desc->data, &attr, engine, prim.get()->get(), allow_empty);
}
std::shared_ptr<T>& getPtr() {

View File

@ -64,7 +64,8 @@ public:
}
operator mkldnn::memory::dims() const {
return dims;
// TODO: this converts on every call, which is inefficient
return mkldnn::memory::dims(dims.begin(), dims.end());
}
bool operator == (const MKLDNNDims& rhs) const {

View File

@ -270,7 +270,7 @@ bool MKLDNNEdge::nodeCanChangeDesc(const MKLDNNNodePtr &node) const {
/// If we have {any, any, any} -> {any} or {any} -> {any, any, any} or {any} -> {any}, it means that
/// the layer doesn't change the memory format
/// We don't support {any, any, nchw} -> {any}
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<memory::format_tag, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc inDesc;
if (inputDesc.getLayout() != InferenceEngine::Layout::ANY) {
@ -312,8 +312,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
}
if (child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size() <= childIdx)
childIdx = 0;
memory::format childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
memory::format_tag childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -325,7 +325,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
if (enterCountUp < 2) {
childInDesc = MKLDNNMemoryDesc(childEdge->getSpecifiedOutputDesc(formats, enterCountUp, ++enterCountDown)).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -346,8 +346,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
if (parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size() <= parentIdx) {
parentIdx = 0;
}
memory::format parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
memory::format_tag parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -359,7 +359,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
if (enterCountUp < 2) {
parentOutDesc = MKLDNNMemoryDesc(parentEdge->getSpecifiedInputDesc(formats, ++enterCountUp, enterCountDown)).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -370,7 +370,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
}
size_t maxFormatCount = 0;
memory::format desc = MKLDNNMemory::GetPlainFormat(getDims());
memory::format_tag desc = MKLDNNMemory::GetPlainFormat(getDims());
for (auto &it : formats) {
if (maxFormatCount < it.second && MKLDNNMemory::isConsistant(getDims(), it.first)) {
maxFormatCount = it.second;
@ -389,7 +389,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
return MKLDNNMemoryDesc(getDims(), inDataType, desc);
}
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<memory::format_tag, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc outDesc;
if (outputDesc.getLayout() != InferenceEngine::Layout::ANY) {
@ -446,8 +446,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size() <= parentIdx) {
parentIdx = 0;
}
memory::format parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
memory::format_tag parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -459,7 +459,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (enterCountDown < 2) {
parentOutDesc = MKLDNNMemoryDesc(parentEdge->getSpecifiedInputDesc(formats, ++enterCountUp, enterCountDown)).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -480,8 +480,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size() <= childIdx) {
childIdx = 0;
}
memory::format childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
memory::format_tag childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -493,7 +493,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (enterCountDown < 2) {
childInDesc = MKLDNNMemoryDesc(childEdge->getSpecifiedOutputDesc(formats, enterCountUp, ++enterCountDown)).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -504,7 +504,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
}
size_t maxFormatCount = 0;
memory::format format = MKLDNNMemory::GetPlainFormat(getDims());
memory::format_tag format = MKLDNNMemory::GetPlainFormat(getDims());
for (auto &it : formats) {
if (maxFormatCount < it.second && MKLDNNMemory::isConsistant(getDims(), it.first)) {
maxFormatCount = it.second;

View File

@ -5,10 +5,13 @@
#pragma once
#include <ie_blob.h>
#include <memory>
#include "mkldnn_memory.h"
#include "mkldnn_dims.h"
#include "mkldnn/ie_mkldnn.h"
#include <map>
#include <memory>
#include <vector>
namespace MKLDNNPlugin {
@ -76,8 +79,10 @@ private:
InferenceEngine::TensorDesc getInputDesc();
InferenceEngine::TensorDesc getOutputDesc();
InferenceEngine::TensorDesc getSpecifiedInputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp = 1, size_t enterCountDown = 0);
InferenceEngine::TensorDesc getSpecifiedOutputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp = 0, size_t enterCountDown = 1);
InferenceEngine::TensorDesc getSpecifiedInputDesc(std::map<mkldnn::memory::format_tag, size_t> formats,
size_t enterCountUp = 1, size_t enterCountDown = 0);
InferenceEngine::TensorDesc getSpecifiedOutputDesc(std::map<mkldnn::memory::format_tag, size_t> formats,
size_t enterCountUp = 0, size_t enterCountDown = 1);
InferenceEngine::TensorDesc inputDesc;
InferenceEngine::TensorDesc outputDesc;

View File

@ -77,17 +77,12 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network,
}
OV_ITT_TASK_NEXT(taskChain, "createConstInputs");
auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, std::string name) {
LayerParams attrs = {layer.get()->name + "_const_" + name, "Const", blob->getTensorDesc().getPrecision()};
auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, const std::vector<size_t>& shape, const std::string& name) {
LayerParams attrs = {layer->name + "_const_" + name, "Const", blob->getTensorDesc().getPrecision()};
auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(attrs);
constLayer->blobs["custom"] = blob;
std::vector<size_t> constDims(layer->insData[0].lock()->getDims().size(), 1);
if (constDims.size() > 1)
constDims[1] = blob.get()->size();
else
constDims[0] = blob.get()->size();
const TensorDesc& td = {blob->getTensorDesc().getPrecision(), constDims, TensorDesc::getLayoutByDims(constDims)};
const TensorDesc& td = {blob->getTensorDesc().getPrecision(), shape, TensorDesc::getLayoutByDims(shape)};
DataPtr newEdgeAfterLayer(new Data(constLayer->name, td));
newEdgeAfterLayer->setName(constLayer->name);
@ -107,16 +102,27 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network,
layer->insData.push_back(newEdgeAfterLayer);
};
// The code block below transforms legacy layers to a form more compatible with opset1 in order to simplify future migration
// TODO: remove after plug-in is migrated on opset1
auto all_layers = details::CNNNetSortTopologically(_clonedNetwork);
for (auto &layer : all_layers) {
if (layer->type == "ScaleShift" && layer->insData.size() == 1) {
auto constDimsRank = layer->insData[0].lock()->getDims().size();
Blob::Ptr scalesBlob = layer->blobs["weights"];
if (scalesBlob != nullptr)
createConstInputTo(layer, scalesBlob, "weights");
if (scalesBlob != nullptr) {
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = scalesBlob->size();
createConstInputTo(layer, scalesBlob, shape, "weights");
}
Blob::Ptr shiftBlob = layer->blobs["biases"];
if (shiftBlob != nullptr) {
createConstInputTo(layer, shiftBlob, "biases");
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = shiftBlob->size();
createConstInputTo(layer, shiftBlob, shape, "biases");
} else if (scalesBlob != nullptr) {
Blob::Ptr biases = make_shared_blob<float>(scalesBlob->getTensorDesc());
if (biases == nullptr)
@ -126,12 +132,65 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network,
for (size_t i = 0; i < biases->size(); i++)
biasesPtr[i] = 0;
createConstInputTo(layer, biases, "biases");
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = biases->size();
createConstInputTo(layer, biases, shape, "biases");
}
} else if (layer->type == "PReLU" && layer->insData.size() == 1) {
Blob::Ptr scalesBlob = layer->blobs["weights"];
if (scalesBlob != nullptr)
createConstInputTo(layer, scalesBlob, "weights");
if (scalesBlob != nullptr) {
std::vector<size_t> shape(layer->insData[0].lock()->getDims().size(), 1);
shape[shape.size() > 1 ? 1 : 0] = scalesBlob->size();
createConstInputTo(layer, scalesBlob, shape, "weights");
}
} else if (layer->type == "DeformableConvolution") {
auto * defConvLayer = dynamic_cast<DeformableConvolutionLayer*>(layer.get());
if (defConvLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert deformable convolution layer.";
Blob::Ptr weightsBlob = defConvLayer->blobs["weights"];
if (weightsBlob != nullptr) {
std::vector<size_t> shape;
if (defConvLayer->_group != 1) {
shape.push_back(defConvLayer->_group);
}
shape.push_back(defConvLayer->_out_depth);
shape.push_back(defConvLayer->input()->getDims()[1]);
for (int i = 1; i <= defConvLayer->_kernel.size(); i++) {
shape.push_back(defConvLayer->_kernel[defConvLayer->_kernel.size() - i]);
}
createConstInputTo(layer, weightsBlob, shape, "weights");
defConvLayer->blobs.clear();
defConvLayer->_weights = nullptr;
}
} else if (layer->type == "BinaryConvolution") {
auto * binConvLayer = dynamic_cast<BinaryConvolutionLayer*>(layer.get());
if (binConvLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert binary convolution layer.";
Blob::Ptr weightsBlob = binConvLayer->blobs["weights"];
if (weightsBlob != nullptr) {
std::vector<size_t> shape;
if (binConvLayer->_group != 1) {
shape.push_back(binConvLayer->_group);
}
shape.push_back(binConvLayer->_out_depth);
shape.push_back(binConvLayer->input()->getDims()[1]);
for (int i = 1; i <= binConvLayer->_kernel.size(); i++) {
shape.push_back(binConvLayer->_kernel[binConvLayer->_kernel.size() - i]);
}
createConstInputTo(layer, weightsBlob, shape, "weights");
binConvLayer->blobs.clear();
binConvLayer->_weights = nullptr;
}
}
}
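Editor's sketch (hypothetical helper, not from this patch): the shape rule the branches above share, isolated. A flat blob of N values lands on the channel axis whenever the destination rank allows one.

static std::vector<size_t> makeConstShape(size_t rank, size_t blobSize) {
    std::vector<size_t> shape(rank, 1);
    shape[rank > 1 ? 1 : 0] = blobSize;   // e.g. rank 4, 64 values -> {1, 64, 1, 1}
    return shape;
}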

View File

@ -3,8 +3,10 @@
//
#include "mkldnn_extension_utils.h"
#include "utils/general_utils.h"
#include <limits>
#include <vector>
#include <numeric>
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -15,8 +17,6 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType)
return 4;
case mkldnn::memory::data_type::s32:
return 4;
case mkldnn::memory::data_type::s16:
return 2;
case mkldnn::memory::data_type::bf16:
return 2;
case mkldnn::memory::data_type::s8:
@ -25,9 +25,8 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType)
return 1;
case mkldnn::memory::data_type::bin:
return 1;
case mkldnn::memory::data_type::data_undef:
case mkldnn::memory::data_type::undef:
return 0;
default:
THROW_IE_EXCEPTION << "Unsupported data type.";
}
@ -36,21 +35,18 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType)
memory::data_type MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision prec) {
switch (prec) {
case InferenceEngine::Precision::FP32:
return memory::f32;
return memory::data_type::f32;
case InferenceEngine::Precision::I32:
return memory::s32;
case InferenceEngine::Precision::I16:
return memory::s16;
return memory::data_type::s32;
case InferenceEngine::Precision::BF16:
return memory::bf16;
return memory::data_type::bf16;
case InferenceEngine::Precision::I8:
return memory::s8;
return memory::data_type::s8;
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL:
return memory::u8;
return memory::data_type::u8;
case InferenceEngine::Precision::BIN:
return memory::bin;
return memory::data_type::bin;
default: {
THROW_IE_EXCEPTION << "The plugin does not support " << prec.name();
}
@ -59,21 +55,18 @@ memory::data_type MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::P
InferenceEngine::Precision MKLDNNExtensionUtils::DataTypeToIEPrecision(memory::data_type dataType) {
switch (dataType) {
case memory::f32:
return InferenceEngine::Precision(InferenceEngine::Precision::FP32);
case memory::s32:
case memory::data_type::f32:
return InferenceEngine::Precision::FP32;
case memory::data_type::s32:
return InferenceEngine::Precision::I32;
case memory::s16:
return InferenceEngine::Precision::I16;
case memory::bf16:
case memory::data_type::bf16:
return InferenceEngine::Precision::BF16;
case memory::s8:
case memory::data_type::s8:
return InferenceEngine::Precision::I8;
case memory::u8:
case memory::data_type::u8:
return InferenceEngine::Precision::U8;
case memory::bin:
case memory::data_type::bin:
return InferenceEngine::Precision::BIN;
default: {
THROW_IE_EXCEPTION << "Unsupported data type.";
}
@ -125,15 +118,82 @@ bool MKLDNNExtensionUtils::initTensorsAreEqual(const InferenceEngine::TensorDesc
in1Block.getOffsetPadding() != uninitNum && in2Block.getOffsetPadding() != uninitNum);
}
PartialBlkDesc PartialBlkDesc::makePlain(const InferenceEngine::SizeVector &dims) {
PartialBlkDesc res;
res.outer_order.resize(dims.size());
std::iota(res.outer_order.begin(), res.outer_order.end(), 0);
return res;
}
PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size) {
PartialBlkDesc res;
res.outer_order.resize(dims.size());
std::iota(res.outer_order.begin(), res.outer_order.end(), 0);
res.inner_blk_size = {block_size};
res.inner_blk_idxes = {1};
return res;
}
PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) {
if (desc.getLayout() == InferenceEngine::ANY)
THROW_IE_EXCEPTION << "Cannot extract partial blocked descriptor for `ANY` layout";
const auto &dims = desc.getDims();
const auto &blk = desc.getBlockingDesc();
const auto &blk_dims = blk.getBlockDims();
const auto &blk_order = blk.getOrder();
PartialBlkDesc res;
res.outer_order = {blk_order.begin(), blk_order.begin() + dims.size()};
res.inner_blk_idxes = {blk_order.begin() + dims.size(), blk_order.end()};
res.inner_blk_size = {blk_dims.begin() + dims.size(), blk_dims.end()};
return res;
}
bool PartialBlkDesc::isAutoExtendedWith(const InferenceEngine::SizeVector &dims) const {
auto tmp_dims = dims;
for (int i = 0; i < inner_blk_size.size(); i++) {
auto idx = inner_blk_idxes[i];
auto blk = inner_blk_size[i];
if (tmp_dims[idx] % blk == 0)
tmp_dims[idx] /= blk;
else
return true;
}
return false;
}
bool PartialBlkDesc::operator == (const PartialBlkDesc& it) const {
return std::tie(this->inner_blk_idxes,
this->inner_blk_size,
this->outer_order) ==
std::tie(it.inner_blk_idxes,
it.inner_blk_size,
it.outer_order);
}
// Lexicographical compare of content
bool PartialBlkDesc::operator < (const PartialBlkDesc& it) const {
return std::tie(this->inner_blk_idxes,
this->inner_blk_size,
this->outer_order) <
std::tie(it.inner_blk_idxes,
it.inner_blk_size,
it.outer_order);
}
std::string MKLDNNExtensionUtils::getReorderArgs(const InferenceEngine::TensorDesc &parentDesc, const InferenceEngine::TensorDesc &childDesc) {
std::string inArgs, outArgs;
if (parentDesc.getPrecision() != childDesc.getPrecision()) {
inArgs += (inArgs.empty() ? "" : "_") + std::string(parentDesc.getPrecision().name());
outArgs += (outArgs.empty() ? "" : "_") + std::string(childDesc.getPrecision().name());
}
if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
auto fmt_tag_src = MKLDNNMemoryDesc(parentDesc).getFormat();
auto fmt_tag_dst = MKLDNNMemoryDesc(childDesc).getFormat();
if (fmt_tag_src != fmt_tag_dst || one_of(mkldnn::memory::format_tag::undef, fmt_tag_src, fmt_tag_dst)) {
inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(fmt_tag_src);
outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(fmt_tag_dst);
}
return inArgs + "_" + outArgs;
}

View File

@ -15,6 +15,61 @@
namespace MKLDNNPlugin {
/**
* Partial tensor descriptor
*
* Represents a class of layouts, e.g. Plain, TailC, CBlocked and others.
*
* Tensors are in the same layout family if they have the same PartialBlkDesc.
*
* Any tensor has the same PartialBlkDesc as its subview tensor.
*
* PartialBlkDesc plus dims allow reconstructing the real TensorDesc (dense representation).
*/
class PartialBlkDesc {
public:
/**
* Check if this partial blocking desc will lead to additional zero padding
* for a real tensor with the provided dims
*
* Example: dims [2, 3, 8, 8] with blocking by 16 on the second dim lead
* to effective dims [2, 16, 8, 8] with all values in [:, 3:16, :, :] zeroed.
*
* @param dims to check for zero auto padding
* @return true if the provided dims would use auto padding, otherwise false
*/
bool isAutoExtendedWith(const InferenceEngine::SizeVector &dims) const;
/**
* Construct PartialBlkDesc from provided TensorDesc
*
* PartialBlkDesc has less expressive power, so some information from the TensorDesc is dropped.
* Different TensorDesc objects may therefore map to equal PartialBlkDesc objects.
*
* @param desc to extract PartialBlkDesc layout-kind information from
* @return PartialBlkDesc object corresponding to the layout described in desc
*/
static PartialBlkDesc extractFrom(const InferenceEngine::TensorDesc &desc);
/** Construct plain PartialBlkDesc based on dims information */
static PartialBlkDesc makePlain(const InferenceEngine::SizeVector &dims);
/** Construct blocked Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size);
/** Comparison operators; allow using PartialBlkDesc as a std::map key */
bool operator == (const PartialBlkDesc& it) const;
bool operator < (const PartialBlkDesc& it) const;
private:
PartialBlkDesc() = default;
InferenceEngine::SizeVector outer_order;
InferenceEngine::SizeVector inner_blk_size;
InferenceEngine::SizeVector inner_blk_idxes;
};
class MKLDNNExtensionUtils {
public:
static uint8_t sizeOfDataType(mkldnn::memory::data_type dataType);

View File

@ -35,6 +35,7 @@
#include <ie_plugin_config.hpp>
#include "utils/blob_dump.h"
#include "utils/general_utils.h"
/*****************************************************
* Debug capability
@ -447,7 +448,7 @@ void MKLDNNGraph::InitOptimalPrimitiveDescriptors() {
void MKLDNNGraph::ExecuteConstantNodesOnly() {
OV_ITT_SCOPED_TASK(itt::domains::MKLDNN_LT, "MKLDNNGraph::ExecuteConstantNodesOnly");
mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
mkldnn::stream stream(eng);
for (auto &graphNode : graphNodes) {
if (!graphNode->isConstant())
continue;
@ -683,13 +684,12 @@ void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::
void *inter_data_ptr = input->second->getChildEdgeAt(0)->getMemory().GetData();
if (ext_data_ptr != inter_data_ptr) {
auto l = in->getTensorDesc().getLayout();
if (l == CHW && input->second->getChildEdgeAt(0)->getDims().ndims() == 4)
l = NCHW;
auto ext_tdesc = MKLDNNMemoryDesc {in->getTensorDesc()};
input->second->getChildEdgeAt(0)->getMemory().SetData(
MKLDNNExtensionUtils::IEPrecisionToDataType(in->getTensorDesc().getPrecision()),
MKLDNNMemory::Convert(l), ext_data_ptr, in->byteSize(), false);
auto ext_mem = MKLDNNMemory(eng);
ext_mem.Create(ext_tdesc, ext_data_ptr, false);
input->second->getChildEdgeAt(0)->getMemory().SetData(ext_mem, 0, false);
}
// todo: make sure 'name' exists in this map...
@ -760,7 +760,8 @@ void MKLDNNGraph::Infer(int batch) {
THROW_IE_EXCEPTION << "Wrong state. Topology is not ready.";
}
mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
mkldnn::stream stream(eng);
for (int i = 0; i < graphNodes.size(); i++) {
if (IsCancellationRequested()) {
ResetCancellationRequest();
@ -778,7 +779,6 @@ void MKLDNNGraph::Infer(int batch) {
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, graphNodes[i]->profiling.execute);
graphNodes[i]->execute(stream);
}
ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
}
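Editor's sketch (placeholder names, not from this patch): the stream migration visible above, condensed. In v0.x a stream was created eagerly and primitives were submitted as a queue; in v1.x a stream binds to an engine and each primitive executes with an explicit argument map (assuming the mkldnn compatibility macros of oneDNN v1.x):

void run_once(mkldnn::engine &eng, mkldnn::primitive &prim,
              mkldnn::memory &src_mem, mkldnn::memory &dst_mem) {
    mkldnn::stream s(eng);
    prim.execute(s, {{MKLDNN_ARG_SRC, src_mem}, {MKLDNN_ARG_DST, dst_mem}});
    s.wait();
}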

View File

@ -30,7 +30,7 @@ public:
Ready = 1,
};
MKLDNNGraph(): status(NotReady), eng(mkldnn::engine(mkldnn::engine::kind::cpu, 0)), cancelation_requested(false) {}
MKLDNNGraph(mkldnn::engine eng = mkldnn::engine(mkldnn::engine::kind::cpu, 0)) : status(NotReady), eng(eng), cancelation_requested(false) {}
Status GetStatus() {
return status;

View File

@ -250,13 +250,15 @@ std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &no
serialization_info[ExecGraphInfoSerialization::OUTPUT_PRECISIONS] = outputPrecisionsStr;
std::string outputLayoutsStr;
auto outLayouts = node->getSelectedPrimitiveDescriptor()->getOutputLayouts();
if (!outLayouts.empty()) {
outputLayoutsStr = mkldnn_fmt2str(static_cast<mkldnn_memory_format_t>(outLayouts[0]));
auto outDescs = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs;
if (!outDescs.empty()) {
auto fmt0 = MKLDNNMemoryDesc(outDescs[0].desc).getFormat();
outputLayoutsStr = mkldnn::utils::fmt2str(fmt0);
bool isAllEqual = true;
for (size_t i = 1; i < outLayouts.size(); i++) {
if (outLayouts[i - 1] != outLayouts[i]) {
for (size_t i = 1; i < outDescs.size(); i++) {
if (MKLDNNMemoryDesc(outDescs[i - 1].desc).getFormat() != MKLDNNMemoryDesc(outDescs[i].desc).getFormat()) {
isAllEqual = false;
break;
}
@ -264,11 +266,13 @@ std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &no
// If all output layouts are the same, we store the name only once
if (!isAllEqual) {
for (size_t i = 1; i < outLayouts.size(); i++)
outputLayoutsStr += "," + std::string(mkldnn_fmt2str(static_cast<mkldnn_memory_format_t>(outLayouts[i])));
for (size_t i = 1; i < outDescs.size(); i++) {
auto fmt = MKLDNNMemoryDesc(outDescs[i].desc).getFormat();
outputLayoutsStr += "," + std::string(mkldnn::utils::fmt2str(fmt));
}
}
} else {
outputLayoutsStr = mkldnn_fmt2str(mkldnn_format_undef);
outputLayoutsStr = mkldnn::utils::fmt2str(mkldnn::memory::format_tag::undef);
}
serialization_info[ExecGraphInfoSerialization::OUTPUT_LAYOUTS] = outputLayoutsStr;

View File

@ -18,8 +18,11 @@
#include "nodes/mkldnn_interpolate_node.h"
#include "nodes/mkldnn_input_node.h"
#include "mkldnn/ie_mkldnn.h"
#include <blob_factory.hpp>
#include <legacy/ie_layers_internal.hpp>
#include "utils/general_utils.h"
// WA for xbyak.h
#ifdef _WIN32
@ -30,7 +33,7 @@
# define _WINSOCK2API_
#endif
#endif
#include <cpu_isa_traits.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include <string>
#include <list>
@ -97,10 +100,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
FuseConvolutionAndDWConvolution(graph);
graph.RemoveDroppedNodes();
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
FuseBinaryConvolutionAndQuantize(graph);
graph.RemoveDroppedNodes();
#endif
FuseBatchNormWithScale(graph);
graph.RemoveDroppedNodes();
@ -108,10 +109,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
RemoveIdentityOperator(graph);
graph.RemoveDroppedNodes();
#if defined(COMPILED_CPU_MKLDNN_ELTWISE_NODE)
FuseConvolutionSumAndConvolutionSumActivation(graph);
graph.RemoveDroppedNodes();
#endif
FuseConvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
@ -140,7 +139,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
RemoveIOScaleShifts(graph);
graph.RemoveDroppedNodes();
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
ChangeConvertToReorder(graph);
graph.RemoveDroppedNodes();
@ -149,7 +147,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
DropConvertReorder(graph);
graph.RemoveDroppedNodes();
#endif
MergePermuteAndReorder(graph);
graph.RemoveDroppedNodes();
@ -277,54 +274,54 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
return true;
};
auto initializeWeightsZeroPoints = [](MKLDNNNodePtr node, MKLDNNNodePtr parent0) {
auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (convNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
if (parent0->getType() == Eltwise) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());
if (eltwiseNode->getOpType() != Subtract)
return false;
if (parent0->getParentEdges().size() != 2)
return false;
if (parent0->getParentEdgesAtPort(1)[0]->getParent()->getCnnLayer()->type == "Const") {
auto arg0 = parent0->getParentEdgesAtPort(1)[0]->getParent();
if (arg0->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
return false;
if (parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != 1 &&
parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != OC)
return false;
auto arg1 = parent0->getParentEdgesAtPort(0)[0]->getParent();
if (arg1->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
return false;
auto zeroPointsBlob = dynamic_cast<TBlob<int8_t>*>(arg0->getCnnLayer()->blobs["custom"].get());
if (zeroPointsBlob == nullptr)
THROW_IE_EXCEPTION << "Cannot cast to TBlob internal zero points blob";
auto zeroPointsData = zeroPointsBlob->buffer().as<int8_t*>();
if (zeroPointsData == nullptr)
THROW_IE_EXCEPTION << "zeroPointsBlob has not allocated buffer";
for (int j = 0; j < parent0->getParentEdgesAtPort(1)[0]->getDims()[0]; j++) {
convNode->weightsZeroPoints.push_back(static_cast<float>(zeroPointsData[j]));
}
} else {
return false;
}
} else {
return false;
}
return true;
};
// auto initializeWeightsZeroPoints = [](MKLDNNNodePtr node, MKLDNNNodePtr parent0) {
// auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
// if (convNode == nullptr)
// THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
//
// int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
//
// if (parent0->getType() == Eltwise) {
// auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());
// if (eltwiseNode->getOpType() != Subtract)
// return false;
//
// if (parent0->getParentEdges().size() != 2)
// return false;
//
// if (parent0->getParentEdgesAtPort(1)[0]->getParent()->getCnnLayer()->type == "Const") {
// auto arg0 = parent0->getParentEdgesAtPort(1)[0]->getParent();
// if (arg0->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
// return false;
//
// if (parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != 1 &&
// parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != OC)
// return false;
//
// auto arg1 = parent0->getParentEdgesAtPort(0)[0]->getParent();
// if (arg1->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
// return false;
//
// auto zeroPointsBlob = dynamic_cast<TBlob<int8_t>*>(arg0->getCnnLayer()->blobs["custom"].get());
// if (zeroPointsBlob == nullptr)
// THROW_IE_EXCEPTION << "Cannot cast to TBlob internal zero points blob";
//
// auto zeroPointsData = zeroPointsBlob->buffer().as<int8_t*>();
// if (zeroPointsData == nullptr)
// THROW_IE_EXCEPTION << "zeroPointsBlob has not allocated buffer";
//
// for (int j = 0; j < parent0->getParentEdgesAtPort(1)[0]->getDims()[0]; j++) {
// convNode->weightsZeroPoints.push_back(static_cast<float>(zeroPointsData[j]));
// }
// } else {
// return false;
// }
// } else {
// return false;
// }
//
// return true;
// };
auto initializeOutputCompensation = [](MKLDNNNodePtr node) {
auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
@ -405,13 +402,14 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
graph.DropNode(dataEltwise);
}
auto weightsEltwise = conv->getParentEdgesAtPort(1)[0]->getParent();
if (initializeWeightsZeroPoints(conv, weightsEltwise)) {
auto p_edge = weightsEltwise->getParentEdgesAtPort(1)[0];
removeEdge(graph, p_edge);
graph.DropNode(weightsEltwise);
}
// [TODO] Weights zero point is not supported on oneDNN side for the moment
// auto weightsEltwise = conv->getParentEdgesAtPort(1)[0]->getParent();
// if (initializeWeightsZeroPoints(conv, weightsEltwise)) {
// auto p_edge = weightsEltwise->getParentEdgesAtPort(1)[0];
// removeEdge(graph, p_edge);
//
// graph.DropNode(weightsEltwise);
// }
initializeOutputCompensation(conv);
}
@ -617,6 +615,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
auto* binConv = dynamic_cast<MKLDNNBinaryConvolutionNode *>(conv.get());
if (binConv) {
if (!binConv->canFuse(activation))
return false;
}
if (!activation->getCnnLayer())
return false;
@ -792,17 +796,23 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1;
};
auto isSutableChildNode = [](MKLDNNNodePtr node) {
if (node->getType() != Eltwise)
auto isSutableChildNode = [](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (childNode->getType() != Eltwise)
return false;
if (!node->getCnnLayer())
if (!childNode->getCnnLayer())
return false;
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
auto* binConv = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parentNode.get());
if (binConv) {
if (!binConv->canFuse(childNode))
return false;
}
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(childNode.get());
if (eltwiseNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) ||
THROW_IE_EXCEPTION << "Cannot get eltwise node " << childNode->getName();
return ((eltwiseNode->getOpType() == MulAdd && childNode->getCnnLayer()->blobs.size() == 2) ||
(eltwiseNode->getOpType() == Prelu));
};
@ -811,14 +821,14 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
if (!isSutableParentNode(conv)) continue;
auto depthwise0 = conv->getChildEdgeAt(0)->getChild();
if (!isSutableChildNode(depthwise0)) continue;
if (!isSutableChildNode(conv, depthwise0)) continue;
conv->fuseWith(depthwise0);
if (depthwise0->getChildEdges().size() == 1) {
auto depthwise1 = depthwise0->getChildEdgeAt(0)->getChild();
if (isSutableChildNode(depthwise1)) {
if (isSutableChildNode(conv, depthwise1)) {
conv->fuseWith(depthwise1);
auto parents = depthwise1->parentEdges;
@ -854,41 +864,29 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
return node->getType() == Convolution;
};
auto isBinaryConvolutionNode = [](MKLDNNNodePtr node) {
return node->getType() == BinaryConvolution;
};
auto is1x1Convolution = [](ConvolutionLayer* layer) {
return layer->_kernel[X_AXIS] == 1 && layer->_kernel[Y_AXIS] == 1;
};
auto isSutableParentConvolution = [&](MKLDNNNodePtr node) {
if (isBinaryConvolutionNode(node)) {
auto *layer = dynamic_cast<BinaryConvolutionLayer *>(node->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
auto *layer = dynamic_cast<ConvolutionLayer *>(node->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
bool isSupportedParams = layer->_group == 1;
if (!isSupportedParams) return false;
} else {
auto *layer = dynamic_cast<ConvolutionLayer *>(node->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
auto* parentConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (parentConvolutionNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
auto* parentConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (parentConvolutionNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
if (!parentConvolutionNode->weightsZeroPoints.empty())
return false;
if (!parentConvolutionNode->weightsZeroPoints.empty())
return false;
bool isSupportedParams =
layer->_group == 1 &&
((is1x1Convolution(layer) && layer->_stride[X_AXIS] == 1 && layer->_stride[Y_AXIS] == 1) || !is1x1Convolution(layer)) &&
(layer->outData[0].get()->getPrecision() == Precision::FP32 || layer->outData[0].get()->getPrecision() == Precision::U8) &&
node->getChildEdgeAt(0)->getDims().ndims() == 4;
if (!isSupportedParams) return false;
}
// TODO [oneDNN]: is it still a valid constraint on the conv to fuse in?
bool isSupportedParams = layer->_group == 1 &&
is1x1Convolution(layer) && // TODO [oneDNN] : fusing is permitted only with 1x1 convolutions
everyone_is(1, layer->_stride[X_AXIS], layer->_stride[Y_AXIS]) &&
one_of(layer->outData[0].get()->getPrecision(), Precision::FP32, Precision::U8) &&
node->getChildEdgeAt(0)->getDims().ndims() == 4;
if (!isSupportedParams) return false;
return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild());
};
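// everyone_is() and one_of() come from "utils/general_utils.h" (included at the
// top of this file); this commit drops the local one_of() duplicate from
// emitter.cpp further below. A sketch of the likely shape of these variadic
// helpers (everyone_is is inferred by analogy with the removed one_of):
//
//     template <typename T, typename P>
//     constexpr bool one_of(T val, P item) { return val == item; }
//     template <typename T, typename P, typename... Args>
//     constexpr bool one_of(T val, P item, Args... others) {
//         return val == item || one_of(val, others...);
//     }
//
//     template <typename T, typename P>
//     constexpr bool everyone_is(T val, P item) { return val == item; }
//     template <typename T, typename P, typename... Args>
//     constexpr bool everyone_is(T val, P item, Args... others) {
//         return val == item && everyone_is(val, others...);
//     }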
@ -898,28 +896,26 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
if (childLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << childNode->getName();
if (!isBinaryConvolutionNode(parentNode)) {
auto* parentLayer = dynamic_cast<ConvolutionLayer*>(parentNode->getCnnLayer().get());
if (parentLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << parentNode->getName();
auto* parentLayer = dynamic_cast<ConvolutionLayer*>(parentNode->getCnnLayer().get());
if (parentLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << parentNode->getName();
if (parentLayer->outData[0].get()->getPrecision() != childLayer->outData[0].get()->getPrecision())
return false;
if (parentLayer->outData[0].get()->getPrecision() != childLayer->outData[0].get()->getPrecision())
return false;
if (parentLayer->precision != childLayer->precision)
return false;
if (parentLayer->precision != childLayer->precision)
return false;
auto parentOutputPrecision = !parentNode->fusedWith.empty()
? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: parentNode->getCnnLayer()->outData[0].get()->getPrecision();
auto parentOutputPrecision = !parentNode->fusedWith.empty()
? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: parentNode->getCnnLayer()->outData[0].get()->getPrecision();
auto childOutputPrecision = !childNode->fusedWith.empty()
? childNode->fusedWith[childNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: childNode->getCnnLayer()->outData[0].get()->getPrecision();
auto childOutputPrecision = !childNode->fusedWith.empty()
? childNode->fusedWith[childNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: childNode->getCnnLayer()->outData[0].get()->getPrecision();
if (parentOutputPrecision != childOutputPrecision)
return false;
}
if (parentOutputPrecision != childOutputPrecision)
return false;
auto* childConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(childNode.get());
if (childConvolutionNode == nullptr)
@ -928,50 +924,24 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
if (!childConvolutionNode->inputZeroPoints.empty() || !childConvolutionNode->weightsZeroPoints.empty())
return false;
bool withBias = (childLayer->_biases != nullptr && childLayer->_biases->size() != 0) ||
childConvolutionNode->getBaseIntputsNumber() == 3;
auto allPads = getPaddings(*childLayer);
bool isSupportedParams = childLayer->_out_depth == childLayer->_group &&
childLayer->_out_depth != 1 &&
childLayer->_kernel[X_AXIS] == 3 && childLayer->_kernel[Y_AXIS] == 3 &&
allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 &&
childLayer->_dilation[X_AXIS] == 1 && childLayer->_dilation[Y_AXIS] == 1 &&
withBias &&
everyone_is(3, childLayer->_kernel[X_AXIS], childLayer->_kernel[Y_AXIS]) &&
everyone_is(1, allPads.begin[X_AXIS], allPads.begin[Y_AXIS]) &&
everyone_is(1, allPads.end[X_AXIS], allPads.end[Y_AXIS]) &&
everyone_is(1, childLayer->_dilation[X_AXIS], childLayer->_dilation[Y_AXIS]) &&
childLayer->_stride[X_AXIS] == childLayer->_stride[Y_AXIS] &&
false && // TODO [oneDNN]: disabled while not ported
one_of(childLayer->_stride[X_AXIS], 1 /*, 2*/) && // TODO [oneDNN]: stride 2 should also be supported
childNode->getChildEdgeAt(0)->getDims().ndims() == 4;
return isSupportedParams;
};
auto isFusingWorthwhile = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (isBinaryConvolutionNode(parentNode)) {
return true;
}
auto* layer = dynamic_cast<ConvolutionLayer*>(childNode->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << childNode->getName();
auto inDims = childNode->inDims[0];
auto outDims = childNode->outDims[0];
int elemSize = MKLDNNExtensionUtils::sizeOfDataType(MKLDNNExtensionUtils::IEPrecisionToDataType(layer->precision));
int L3_cache_size = mkldnn_get_cache_size(3, false);
int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize;
int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize;
auto* parentConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(parentNode.get());
if (parentConvolutionNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << parentNode->getName();
bool isInt8 = parentConvolutionNode->canBeExecutedInInt8();
bool isAVX512NotSupported = !mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common);
return isInt8 ? isAVX512NotSupported : (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2);
};
for (int i = 0; i < graphNodes.size(); i++) {
if (!isConvolutionNode(graphNodes[i]) && !isBinaryConvolutionNode(graphNodes[i])) continue;
if (!isConvolutionNode(graphNodes[i])) continue;
auto parentConvNode = graphNodes[i];
if (!isSutableParentConvolution(parentConvNode)) continue;
@ -979,8 +949,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild();
if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue;
if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue;
parentConvNode->fuseWith(childConvNode);
for (auto node : childConvNode->getFusedWith())
@ -991,7 +959,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
}
}
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
void MKLDNNGraphOptimizer::FuseConvolutionAndQuantize(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
@ -1117,18 +1084,16 @@ void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph)
return isSutableBinConv && node->getChildEdges().size() == 1;
};
auto isSutableChildNode = [](MKLDNNNodePtr node) {
if (!node->getCnnLayer())
auto isSutableChildNode = [](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (childNode->getType() != Quantize)
return false;
if (node->getType() != Quantize)
auto* binConv = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parentNode.get());
if (!binConv) {
return false;
}
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
if (quantizeNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
return quantizeNode->isBinarization();
return binConv->canFuse(childNode);
};
for (int i = 0; i < graphNodes.size(); i++) {
@ -1136,7 +1101,7 @@ void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph)
if (!isSutableParentNode(parent)) continue;
auto child = parent->getChildEdgeAt(0)->getChild();
if (!isSutableChildNode(child)) continue;
if (!isSutableChildNode(parent, child)) continue;
parent->fuseWith(child);
@ -1205,7 +1170,6 @@ void MKLDNNGraphOptimizer::FusePoolingAndQuantize(MKLDNNGraph &graph) {
graph.DropNode(child);
}
}
#endif
/**
* Check if there is a data dependency between parent and child
@ -1273,7 +1237,6 @@ static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent,
* ***
*/
#if defined(COMPILED_CPU_MKLDNN_ELTWISE_NODE)
void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes();
@ -1308,17 +1271,27 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
bool isSutableParent1 = parent1->getType() == Convolution || parent1->getType() == BinaryConvolution;
bool isSutableParent2 = parent2->getType() == Convolution || parent2->getType() == BinaryConvolution;
auto* parentNode1 = dynamic_cast<MKLDNNConvolutionNode *>(parent1.get());
if (parentNode1) {
if (!parentNode1->canBeExecutedInInt8()) {
isSutableParent1 = isSutableParent1 && parentNode1->getFusedWith().empty();
auto* binConvNode1 = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parent1.get());
if (binConvNode1) {
isSutableParent1 = isSutableParent1 && binConvNode1->canFuse(graphNode);
}
auto* binConvNode2 = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parent2.get());
if (binConvNode2) {
isSutableParent2 = isSutableParent2 && binConvNode2->canFuse(graphNode);
}
auto* convNode1 = dynamic_cast<MKLDNNConvolutionNode *>(parent1.get());
if (convNode1) {
if (!convNode1->canBeExecutedInInt8()) {
isSutableParent1 = isSutableParent1 && convNode1->getFusedWith().empty();
}
}
auto* parentNode2 = dynamic_cast<MKLDNNConvolutionNode *>(parent2.get());
if (parentNode2) {
if (!parentNode2->canBeExecutedInInt8()) {
isSutableParent2 = isSutableParent2 && parentNode2->getFusedWith().empty();
auto* convNode2 = dynamic_cast<MKLDNNConvolutionNode *>(parent2.get());
if (convNode2) {
if (!convNode2->canBeExecutedInInt8()) {
isSutableParent2 = isSutableParent2 && convNode2->getFusedWith().empty();
}
}
@ -1387,6 +1360,10 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
if (mergedConvNode != nullptr)
childPort = mergedConvNode->getParentEdges().size();
auto* mergedBinConvNode = dynamic_cast<MKLDNNBinaryConvolutionNode*>(mergedConv.get());
if (mergedBinConvNode != nullptr)
childPort = mergedBinConvNode->getParentEdges().size();
MKLDNNEdgePtr edgePtr(new MKLDNNEdge(peerNode, mergedConv, peer_port, childPort));
graph.GetEdges().push_back(edgePtr);
@ -1415,7 +1392,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
sum->remove();
}
}
#endif
void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
@ -1759,7 +1735,6 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
}
}
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
std::set<MKLDNNNodePtr> processed;
int graphNodesSize = graph.GetNodes().size();
@ -1897,7 +1872,6 @@ void MKLDNNGraphOptimizer::ChangeConvertToReorder(MKLDNNGraph& graph) {
graph.DropNode(convertCandidate);
}
}
#endif
void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
for (MKLDNNNodePtr& node : graph.GetNodes()) {
@ -1913,7 +1887,6 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
if (cur->getTensorDesc().getPrecision() != l->outData[0]->getTensorDesc().getPrecision()) {
if (node->name.find("_iScaleShift_") != std::string::npos) {
auto child = node->childEdges[0].lock()->getChild();
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
if (child->type == Reorder) {
MKLDNNReorderNode* rn = dynamic_cast<MKLDNNReorderNode*>(child.get());
if (rn != nullptr) {
@ -1921,16 +1894,11 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
graph.DropNode(node);
}
} else {
#else
THROW_IE_EXCEPTION << "Strange case. No Reorder after iScaleShift";
#endif
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
}
#endif
} else if (node->name.find("_oScaleShift_") != std::string::npos) {
auto parent = node->parentEdges[0].lock()->getParent();
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
if (parent->type == Reorder) {
MKLDNNReorderNode* rn = dynamic_cast<MKLDNNReorderNode*>(parent.get());
if (rn != nullptr) {
@ -1938,12 +1906,8 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
graph.DropNode(node);
}
} else {
#else
THROW_IE_EXCEPTION << "Strange case. No Reorder before oScaleShift";
#endif
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
}
#endif
}
}
}
@ -2054,9 +2018,9 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) {
std::vector<float> newCropLow(cropLowData.size());
std::vector<float> newCropHigh(cropHighData.size());
for (int i = 0; i < cropLowData.size(); i++)
newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getBeta());
newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getAlpha());
for (int i = 0; i < cropHighData.size(); i++)
newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getAlpha());
newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getBeta());
quantizeNode->setCropLow(newCropLow);
quantizeNode->setCropHigh(newCropHigh);
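// Worked example of the swap above, assuming the usual Clamp convention
// out = min(max(x, alpha), beta), i.e. alpha is the lower bound and beta the
// upper bound. Fusing Clamp(alpha = 0.f, beta = 6.f) into a quantize node with
// cropLow = -1.f and cropHigh = 10.f must tighten the range to
//
//     newCropLow  = std::max(-1.f, 0.f);  // == 0.f, max with alpha
//     newCropHigh = std::min(10.f, 6.f);  // == 6.f, min with beta
//
// whereas the old code mixed the bounds up and produced the inverted range [6, 0].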

View File

@ -27,26 +27,20 @@ private:
void FuseConvolutionAndDepthwise(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph);
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
void FuseConvolutionAndQuantize(MKLDNNGraph &graph);
void FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph);
void FusePoolingAndQuantize(MKLDNNGraph &graph);
#endif
void FuseBatchNormWithScale(MKLDNNGraph& graph);
#if defined(COMPILED_CPU_MKLDNN_ELTWISE_NODE)
void FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph);
#endif
void FuseMVNAndSimpleOperation(MKLDNNGraph &graph);
void FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph);
void FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph);
void RemoveIdentityOperator(MKLDNNGraph& graph);
void RemoveIOScaleShifts(MKLDNNGraph& graph);
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
void DropDoubleReorders(MKLDNNGraph& graph);
void DropConvertReorder(MKLDNNGraph& graph);
void ChangeConvertToReorder(MKLDNNGraph &graph);
#endif
void FuseConvolutionAndZeroPoints(MKLDNNGraph &graph);
void FuseBroadcastAndEltwise(MKLDNNGraph &graph);
void FuseEltwiseAndSimple(MKLDNNGraph &graph);

View File

@ -113,14 +113,14 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
// these precisions are supported by mkldnn, so we push the blob directly
// BUT if a mean image exists, we convert the blob and send FP32
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL:
case InferenceEngine::Precision::I16: {
case InferenceEngine::Precision::BOOL: {
if (graph->hasMeanImageFor(input.first))
inPrec = InferenceEngine::Precision::FP32;
break;
}
// these precisions are unsupported by mkldnn, so we convert the blob and send I32
case InferenceEngine::Precision::U16:
case InferenceEngine::Precision::I16:
case InferenceEngine::Precision::I64:
case InferenceEngine::Precision::U64: {
inPrec = InferenceEngine::Precision::I32;
@ -143,9 +143,7 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushStates() {
auto cur_state_mem = cur_node->getStore();
auto data_ptr = state->GetState()->cbuffer().as<void*>();
auto data_size = state->GetState()->byteSize();
auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(cur_state_mem->GetDataType());
auto padSize = cur_state_mem->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetData()) + padSize * elemSize;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetPtr());
cpu_memcpy(cur_state_mem_buf, data_ptr, data_size);
}
@ -164,9 +162,7 @@ void MKLDNNPlugin::MKLDNNInferRequest::PullStates() {
auto cur_state_mem = cur_node->getStore();
auto data_ptr = state->GetState()->cbuffer().as<void*>();
auto data_size = state->GetState()->byteSize();
auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(cur_state_mem->GetDataType());
auto padSize = cur_state_mem->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetData()) + padSize * elemSize;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetPtr());
cpu_memcpy(data_ptr, cur_state_mem_buf, data_size);
}

File diff suppressed because it is too large

View File

@ -4,40 +4,71 @@
#pragma once
#include <memory>
#include <vector>
#include "ie_layouts.h"
#include "mkldnn_dims.h"
#include <mkldnn.hpp>
#include <string>
#include <mkldnn_types.h>
#include <string>
#include <functional>
#include <memory>
#include <vector>
/**
* @file contains concept classes for working with memory/tensor/blob abstractions at the plugin level.
*
* MKLDNNMemoryDesc - the descriptor of a tensor representation in memory. Describes all the information
* required for proper allocation and handling of a tensor in some buffer. The real memory is not present,
* just the description. This object answers the question of how and where the data with logical index
* [x1, x2, .. xN] is placed in the real buffer. In the simplest case it describes a mapping between a
* "logical offset" and a "real offset".
*
* MKLDNNMemory is an abstraction of some real tensor which contains data. In short, it is a pair of
* a memory descriptor and a raw buffer handle that holds the data. In the case of system memory the raw
* buffer is simply a "void*" pointing to some system memory buffer.
*
*/
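/**
* A minimal usage sketch of the two classes (engine construction shown for
* illustration only):
*
*     mkldnn::engine eng(mkldnn::engine::kind::cpu, 0);
*     MKLDNNMemoryDesc desc({1, 3, 224, 224},
*                           mkldnn::memory::data_type::f32,
*                           mkldnn::memory::format_tag::nchw);
*     MKLDNNMemory mem(eng);
*     mem.Create(desc);                                 // allocates a buffer
*     auto* first = static_cast<float*>(mem.GetPtr());  // first element
*/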
namespace MKLDNNPlugin {
/**
* Represents the internal plugin abstraction of a tensor description
*
*/
class MKLDNNMemoryDesc {
public:
MKLDNNMemoryDesc(): desc({}, mkldnn::memory::data_type::f32, mkldnn::memory::format::format_undef) {}
/** Empty constructor - doesn't define any tensor representation */
MKLDNNMemoryDesc(): desc() {}
/** Construct a tensor desc with a plain layout format (like an ND C array) */
MKLDNNMemoryDesc(const mkldnn::memory::dims& dims, mkldnn::memory::data_type dataType);
/** Construct a tensor desc with the specified layout format tag. Any and Undef are not supported */
MKLDNNMemoryDesc(const mkldnn::memory::dims& dims, mkldnn::memory::data_type dataType, mkldnn::memory::format_tag format);
explicit MKLDNNMemoryDesc(const InferenceEngine::TensorDesc& tDesc);
explicit MKLDNNMemoryDesc(const mkldnn::memory::desc& desc): desc(desc) {}
MKLDNNMemoryDesc(mkldnn::memory::dims dims, mkldnn::memory::data_type dataType, mkldnn::memory::format format);
mkldnn::memory::format getFormat() const {
return static_cast<mkldnn::memory::format>(desc.data.format);
}
/**
* Try to deduce the original format tag used on creation
*
* @return the format tag if it could be deduced
*/
mkldnn::memory::format_tag getFormat() const;
mkldnn::memory::data_type getDataType() const {
return static_cast<mkldnn::memory::data_type>(desc.data.data_type);
}
size_t GetElementSize() const;
MKLDNNDims getDims() const {
return MKLDNNDims(desc.data.dims, desc.data.ndims);
}
bool blocksExtended() const;
operator bool() const {
return getFormat() != mkldnn::memory::format::any && getFormat() != mkldnn::memory::format::format_undef;
return getFormat() != mkldnn::memory::format_tag::any && getFormat() != mkldnn::memory::format_tag::undef;
}
bool operator == (const MKLDNNMemoryDesc& rhs) const;
@ -46,15 +77,19 @@ public:
operator mkldnn::memory::desc() const;
operator InferenceEngine::TensorDesc() const;
bool isPlainFormat() const;
bool isBlockedCFormat(size_t blk_size = UNREACHABLE_DIM) const;
bool isTailCFormat() const;
bool isSame(mkldnn::memory::format_tag fmt) const;
private:
static constexpr size_t UNREACHABLE_DIM = std::numeric_limits<size_t>::max();
mkldnn::memory::desc desc;
};
class MKLDNNMemory;
using MKLDNNMemoryPtr = std::shared_ptr<MKLDNNMemory>;
class MKLDNNMemory {
public:
explicit MKLDNNMemory(const mkldnn::engine& eng);
@ -68,13 +103,17 @@ public:
}
mkldnn::memory::desc GetDescriptor() const {
return prim->get_primitive_desc().desc();
return prim->get_desc();
}
mkldnn::memory::primitive_desc GetPrimitiveDescriptor() const {
return prim->get_primitive_desc();
const MKLDNNMemoryDesc GetDesc() const {
return MKLDNNMemoryDesc {prim->get_desc()};
}
/**
* Return the handle of the buffer. Real data may start at some other offset
* @return
*/
void* GetData() const {
void* data = prim->get_data_handle();
if (data == nullptr)
@ -82,6 +121,18 @@ public:
return data;
}
/**
* Return a raw pointer to the first element
* Like GetData(), but with the offset applied.
* @return
*/
void* GetPtr() const {
auto ptr = static_cast<uint8_t*>(GetData());
ptr += GetDescriptor().data.offset0 * GetDesc().GetElementSize();
return ptr;
}
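// Example of the offset arithmetic above: for f32 data (element size 4) and
// desc.data.offset0 == 8, GetPtr() returns GetData() advanced by 8 * 4 == 32 bytes.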
mkldnn::memory::data_type GetDataType() const {
return static_cast<mkldnn::memory::data_type>(GetDescriptor().data.data_type);
}
@ -89,41 +140,35 @@ public:
size_t GetSize() const;
size_t GetElementsCount() const;
mkldnn::memory::format GetFormat() const {
return static_cast<mkldnn::memory::format>(prim->get_primitive_desc().desc().data.format);
}
mkldnn::memory::dims GetDims() const {
auto data = GetDescriptor().data;
return std::vector<ptrdiff_t>(data.dims, data.dims + data.ndims);
return {std::begin(data.dims), std::begin(data.dims) + data.ndims};
}
void Create(mkldnn::memory::dims dims, mkldnn::memory::data_type data_type, mkldnn::memory::format format,
void Create(const mkldnn::memory::dims& dims, mkldnn::memory::data_type data_type, mkldnn::memory::format_tag format,
const void* data = nullptr);
void Create(const mkldnn::memory::desc& desc, const void* data = nullptr, bool pads_zeroing = true);
void SetData(mkldnn::memory::data_type dataType, mkldnn::memory::format format, const void* data, size_t size, bool ftz = true) const;
void SetData(const MKLDNNMemory& memory, bool ftz = true) const;
// Interprets the input data as being in a plain format
void SetData(mkldnn::memory::data_type dataType, mkldnn::memory::format_tag format, const void* data, size_t size, bool ftz = true) const;
void SetData(const MKLDNNMemory& memory, size_t size = 0, bool ftz = true) const;
void FillZero();
static bool IsPlainFormat(mkldnn::memory::format format);
static bool IsGroupedFormat(mkldnn::memory::format format);
static mkldnn::memory::format GetPlainFormat(mkldnn::memory::dims dims);
static InferenceEngine::Layout GetPlainLayout(mkldnn::memory::dims dims);
static bool isConsistant(mkldnn::memory::dims dims, mkldnn::memory::format format);
static mkldnn::memory::format Convert(const InferenceEngine::Layout layout);
static mkldnn::memory::format_tag GetPlainFormat(const mkldnn::memory::dims& dims);
static InferenceEngine::Layout GetPlainLayout(const mkldnn::memory::dims& dims);
static bool isConsistant(const mkldnn::memory::dims& dims, mkldnn::memory::format_tag format);
static mkldnn::memory::format_tag Convert(const InferenceEngine::Layout layout);
static InferenceEngine::Precision convertToIePrec(mkldnn::memory::data_type dataType);
static mkldnn::memory::data_type convertToDataType(const InferenceEngine::Precision &precision);
static std::string formatToString(mkldnn::memory::format fmt);
static void CreateBlockingDesc(mkldnn::memory::desc& desc);
static std::string formatToString(mkldnn::memory::format_tag fmt);
private:
std::shared_ptr<mkldnn::memory> prim;
mkldnn::engine eng;
};
using MKLDNNMemoryPtr = std::shared_ptr<MKLDNNMemory>;
} // namespace MKLDNNPlugin

View File

@ -27,7 +27,7 @@ namespace MKLDNNPlugin {
*
* Example:
*
* Mem
* Mem(offset)
* | |____| Box {4, 5}
* | |_____________| Box {2, 6}
* | |____| Box {3, 4}
@ -38,7 +38,7 @@ namespace MKLDNNPlugin {
*
* Boxes which have an ExecOrder-axis intersection should have no Mem-axis intersections.
* The goal is to define a minimal required memory blob to store all boxes with such
* constraints and specify all corresponfing position on Mem axis(through offset field).
* constraints and specify all corresponding position on Mem axis(through offset field).
*
* NOTE!
* Exec order is predefined.

View File

@ -27,17 +27,12 @@
#include <nodes/mkldnn_pooling_node.h>
#include <nodes/mkldnn_reorder_node.h>
#include <nodes/mkldnn_reshape_node.h>
#include <nodes/mkldnn_roi_pooling_node.h>
#include <nodes/mkldnn_softmax_node.h>
#include <nodes/mkldnn_tile_node.h>
#include <nodes/mkldnn_split_node.h>
#include <nodes/mkldnn_pad_node.h>
#include <nodes/mkldnn_permute_node.h>
#include <nodes/mkldnn_memory_node.hpp>
#include <nodes/mkldnn_rnn.h>
#include <nodes/mkldnn_quantize_node.h>
#include <nodes/mkldnn_bin_conv_node.h>
#include <nodes/mkldnn_def_conv_node.h>
#include <nodes/mkldnn_mvn_node.h>
#include <nodes/mkldnn_normalize_node.h>
#include <nodes/mkldnn_reduce_node.h>
@ -45,6 +40,7 @@
#include <nodes/mkldnn_scatter_update_node.h>
#include <nodes/mkldnn_interpolate_node.h>
#include <mkldnn_types.h>
#include <dnnl_types.h>
#include "mkldnn_extension_utils.h"
#include "nodes/common/cpu_memcpy.h"
@ -202,7 +198,7 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::
while (getline(stream, str, ',')) {
if (str.substr(0, 4) != "cpu:")
continue;
inputMemoryFormatsFilter.push_back(mkldnn_str2fmt(str.substr(4, str.size()).c_str()));
inputMemoryFormatsFilter.push_back(mkldnn::utils::str2fmt(str.substr(4, str.size()).c_str()));
}
}
@ -213,7 +209,7 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::
while (getline(stream, str, ',')) {
if (str.substr(0, 4) != "cpu:")
continue;
outputMemoryFormatsFilter.push_back(mkldnn_str2fmt(str.substr(4, str.size()).c_str()));
outputMemoryFormatsFilter.push_back(mkldnn::utils::str2fmt(str.substr(4, str.size()).c_str()));
}
}
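// Both filters parse a comma-separated list of "cpu:"-prefixed format names
// taken from the layer's memory-format hints (the exact attribute names are
// not shown here); e.g. the value "cpu:nChw16c,cpu:nchw" yields
// {format_tag::nChw16c, format_tag::nchw} via mkldnn::utils::str2fmt().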
}
@ -488,25 +484,25 @@ const std::vector<MKLDNNEdgePtr> MKLDNNNode::getChildEdgesAtPort(size_t idx) con
}
std::vector<memory::format> MKLDNNNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
std::vector<memory::format_tag> MKLDNNNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
if (dims.ndims() == 0)
return {memory::format::x};
return {memory::format_tag::x};
else if (dims.ndims() == 1)
return {memory::format::x};
return {memory::format_tag::x};
else if (dims.ndims() == 2)
return {memory::format::nc};
return {memory::format_tag::nc};
else if (dims.ndims() == 3)
return {memory::format::tnc, memory::format::ntc};
return {memory::format_tag::tnc, memory::format_tag::ntc};
else if (dims.ndims() == 4)
return {memory::format::nchw, memory::format::nChw8c, memory::format::nChw16c};
return {memory::format_tag::nchw, memory::format_tag::nChw8c, memory::format_tag::nChw16c};
else if (dims.ndims() == 5)
return {memory::format::ncdhw, memory::format::nCdhw8c, memory::format::nCdhw16c};
return {memory::format::any};
return {memory::format_tag::ncdhw, memory::format_tag::nCdhw8c, memory::format_tag::nCdhw16c};
return {memory::format_tag::any};
}
void MKLDNNNode::execute(mkldnn::stream strm) {
if (prim) {
strm.submit({*prim});
(*prim).execute(strm, primArgs);
}
}
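// primArgs is the std::unordered_map<int, mkldnn::memory> declared in the node
// header below; each node is expected to fill it once the primitive is created,
// conceptually (sketch; how the mkldnn::memory objects are obtained from the
// edges is an assumption):
//
//     primArgs[DNNL_ARG_SRC] = srcMem;  // mkldnn::memory for input 0
//     primArgs[DNNL_ARG_DST] = dstMem;  // mkldnn::memory for output 0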
@ -516,7 +512,8 @@ void MKLDNNNode::initSupportedPrimitiveDescriptors() {
for (auto& desc : descs) {
auto itpd = desc.createPrimitiveDescriptorIterator(engine);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
@ -527,35 +524,35 @@ void MKLDNNNode::initSupportedPrimitiveDescriptors() {
config.inConfs.push_back(dataConfig);
}
std::vector<mkldnn::memory::format> outFormats;
for (size_t i = 0; i < descOutputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getDstMemDesc(itpd, i));
config.outConfs.push_back(dataConfig);
auto primDesc = itpd.fetch();
auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0);
if (dstPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
} else {
// This path is needed to correctly handle Deconvolution node
auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0);
if (diffSrcPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.diff_src_primitive_desc().desc().data.format));
}
}
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
void MKLDNNNode::filterSupportedPrimitiveDescriptors() {
// Compare by partial layout descriptor (without particular stride values)
auto areCompatible = [](const TensorDesc& tdesc, mkldnn::memory::format_tag fmt) {
TensorDesc fmt_tdesc = MKLDNNMemoryDesc{
MKLDNNDims(tdesc.getDims()),
MKLDNNExtensionUtils::IEPrecisionToDataType(tdesc.getPrecision()),
fmt};
auto tmp_partial_tdesc = PartialBlkDesc::extractFrom(fmt_tdesc);
auto actual_partial_tdesc = PartialBlkDesc::extractFrom(tdesc);
return tmp_partial_tdesc == actual_partial_tdesc;
};
if (!inputMemoryFormatsFilter.empty() || !outputMemoryFormatsFilter.empty()) {
auto itpd = supportedPrimitiveDescriptors.begin();
while (itpd != supportedPrimitiveDescriptors.end()) {
@ -565,12 +562,12 @@ void MKLDNNNode::filterSupportedPrimitiveDescriptors() {
bool isSuitableDesc = true;
for (int i = 0; i < inputMemoryFormatsFilter.size(); i++) {
if (inputMemoryFormatsFilter[i] != MKLDNNMemoryDesc(config.inConfs[i].desc).getFormat())
isSuitableDesc = false;
const bool matched = areCompatible(config.inConfs[i].desc, inputMemoryFormatsFilter[i]);
isSuitableDesc &= matched;
}
for (int i = 0; i < outputMemoryFormatsFilter.size(); i++) {
if (outputMemoryFormatsFilter[i] != MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat())
isSuitableDesc = false;
const bool matched = areCompatible(config.outConfs[i].desc, outputMemoryFormatsFilter[i]);
isSuitableDesc &= matched;
}
if (!isSuitableDesc) {
itpd = supportedPrimitiveDescriptors.erase(itpd);
@ -600,20 +597,20 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
size_t selected_count = 0;
for (size_t j = 0; j < descs.size(); j++) {
const auto &desc = descs[j];
std::shared_ptr<primitive_desc_iterator> itpd;
primitive_desc_iterator itpd;
if (attr == nullptr) {
itpd = std::make_shared<primitive_desc_iterator>(desc.createPrimitiveDescriptorIterator(engine));
itpd = desc.createPrimitiveDescriptorIterator(engine);
} else {
itpd = std::make_shared<primitive_desc_iterator>(desc.createPrimitiveDescriptorIterator(engine, *(attr.get())));
itpd = desc.createPrimitiveDescriptorIterator(engine, *(attr.get()));
}
while (itpd->is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig cfg;
cfg.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = getSrcMemDesc(*itpd, i);
dataConfig.desc = getSrcMemDesc(itpd, i);
cfg.inConfs.push_back(dataConfig);
}
@ -621,10 +618,10 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(*itpd, i);
dataConfig.desc = getDstMemDesc(itpd, i);
cfg.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd->get_impl_info_str().c_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (selected_count == selectedPrimitiveDescriptorIndex) {
if (impl_type != selectedPD->getImplementationType()) {
THROW_IE_EXCEPTION << "Cannot get the original layer configuration!";
@ -637,7 +634,8 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
}
}
selected_count++;
(*itpd)++;
if (!itpd.next_impl())
break;
}
}
@ -747,16 +745,9 @@ void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::pri
auto create = [&] () {
auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc());
auto newFormat = newDesc.getFormat();
if (newFormat == mkldnn::memory::ncdhw) {
newFormat = mkldnn::memory::goihw;
}
if (newFormat == mkldnn::memory::nchw) {
newFormat = mkldnn::memory::oihw;
}
MKLDNNMemory memory{ engine };
memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer());
memory.Create(newDesc, internalBlob->buffer());
MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(engine));
_ptr->Create(intDescs[i]);
@ -1045,7 +1036,7 @@ bool MKLDNNNode::isInitConfig(const InferenceEngine::LayerConfig& config) const
}
MKLDNNMemoryDesc MKLDNNNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
@ -1057,7 +1048,7 @@ MKLDNNMemoryDesc MKLDNNNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &prim
}
MKLDNNMemoryDesc MKLDNNNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
@ -1091,8 +1082,25 @@ int MKLDNNNode::getMaxBatch() {
void MKLDNNNode::setDynamicBatchLim(int lim) {
dynBatchLim = lim;
if (prim) {
prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size());
auto setDynamicBatch = [this](int argType, int newBatch) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
auto oldMem = param->second;
mkldnn::memory::desc newMemDesc(oldMem.get_desc());
newMemDesc.data.dims[0] = newBatch;
newMemDesc.data.padded_dims[0] = newBatch;
mkldnn::memory newMem(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
primArgs.at(argType) = newMem;
}
};
if (!primArgs.empty()) {
int newBatch = batchToProcess();
setDynamicBatch(DNNL_ARG_SRC, newBatch);
setDynamicBatch(DNNL_ARG_DST, newBatch);
setDynamicBatch(DNNL_ARG_DIFF_SRC, newBatch);
setDynamicBatch(DNNL_ARG_DIFF_DST, newBatch);
}
}
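// The lambda above works because mkldnn::memory is a cheap wrapper: a new
// memory object is created over the *same* data handle, only with the
// outermost (batch) dimension of the descriptor patched, so nothing is copied
// and the full-batch buffer keeps backing the primitive. dims[0] and
// padded_dims[0] must be updated together, otherwise the descriptor would
// describe a tensor padded along the batch axis.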

View File

@ -207,16 +207,16 @@ static std::string NameFromType(Type type) {
class PrimitiveDescInfo {
public:
PrimitiveDescInfo(const InferenceEngine::LayerConfig conf, impl_desc_type type): config(conf) {
PrimitiveDescInfo(const InferenceEngine::LayerConfig& conf, impl_desc_type type): config(conf) {
implementationType = type;
}
PrimitiveDescInfo(const InferenceEngine::LayerConfig conf, impl_desc_type type, std::vector<mkldnn::memory::format> outFmts): config(conf) {
PrimitiveDescInfo(const InferenceEngine::LayerConfig& conf, impl_desc_type type, const std::vector<mkldnn::memory::format_tag>& outFmts): config(conf) {
implementationType = type;
outputLayouts = outFmts;
}
PrimitiveDescInfo(const InferenceEngine::LayerConfig conf, impl_desc_type type, mkldnn::memory::format outFmt): config(conf) {
PrimitiveDescInfo(const InferenceEngine::LayerConfig& conf, impl_desc_type type, mkldnn::memory::format_tag outFmt): config(conf) {
implementationType = type;
setOutputLayouts(outFmt);
@ -238,7 +238,7 @@ public:
return implementationType;
}
const std::vector<mkldnn::memory::format>& getOutputLayouts() const {
const std::vector<mkldnn::memory::format_tag>& getOutputLayouts() const {
return outputLayouts;
}
@ -246,7 +246,7 @@ public:
implementationType = type;
}
void setOutputLayouts(mkldnn::memory::format outFmt) {
void setOutputLayouts(mkldnn::memory::format_tag outFmt) {
outputLayouts.clear();
for (int i = 0; i < config.outConfs.size(); i++) {
@ -257,7 +257,7 @@ public:
private:
InferenceEngine::LayerConfig config;
impl_desc_type implementationType;
std::vector<mkldnn::memory::format> outputLayouts;
std::vector<mkldnn::memory::format_tag> outputLayouts;
};
class MKLDNNNode : public InferenceEngine::details::no_copy {
@ -458,7 +458,7 @@ public:
for (const auto& desc : descs) {
auto itpd = desc.createPrimitiveDescriptorIterator(engine, attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
std::vector<InferenceEngine::TensorDesc> srcDescs;
for (size_t i = 0; i < descInputNumbers(desc); i++)
srcDescs.push_back(getSrcMemDesc(itpd, i));
@ -467,17 +467,17 @@ public:
for (size_t i = 0; i < descOutputNumbers(desc); i++)
dstDescs.push_back(getDstMemDesc(itpd, i));
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type == selected_pd->getImplementationType() &&
descsEqual(srcDescs, selected_pd->getConfig().inConfs) &&
descsEqual(dstDescs, selected_pd->getConfig().outConfs)) {
prepareMemory(selected_pd, itpd);
PD prim_desc = createPd<PD, D, FPD>(desc);
itpd.getPrimitiveDescriptor(prim_desc);
return prim_desc;
return {itpd.get()};
}
itpd++;
if (!itpd.next_impl())
break;
}
}
@ -553,8 +553,8 @@ protected:
std::vector <MKLDNNNodePtr> fusedWith;
std::vector <MKLDNNNodePtr> mergedWith;
std::vector <impl_desc_type> implPriorities;
std::vector <mkldnn_memory_format_t> inputMemoryFormatsFilter;
std::vector <mkldnn_memory_format_t> outputMemoryFormatsFilter;
std::vector <mkldnn::memory::format_tag> inputMemoryFormatsFilter;
std::vector <mkldnn::memory::format_tag> outputMemoryFormatsFilter;
std::string originalLayers; // contains names of the original layers separated by comma
@ -573,6 +573,7 @@ protected:
std::vector<InferenceEngine::Blob::Ptr> internalBlobs;
std::vector<MKLDNNMemoryPtr> internalBlobMemory;
std::vector<PrimitiveDescInfo> supportedPrimitiveDescriptors;
std::unordered_map<int, mkldnn::memory> primArgs;
MKLDNNPrimitive prim;
std::vector<MKLDNNDescriptor> descs;
@ -590,7 +591,7 @@ protected:
virtual const std::vector<impl_desc_type>& getPrimitivesPriority();
std::vector<mkldnn::memory::format> getAvailableFormatsForDims(const MKLDNNDims& dims) const;
virtual std::vector<mkldnn::memory::format_tag> getAvailableFormatsForDims(const MKLDNNDims& dims) const;
int batchToProcess();
InferenceEngine::Blob::Ptr createInternalBlob(InferenceEngine::SizeVector dims, bool weights, bool is_grouped = false);
@ -679,15 +680,4 @@ static struct REG_MKLDNN_CONCAT3(Registrar4, __prim, __LINE__) {
} \
} REG_MKLDNN_CONCAT3(_reg_, __prim, __LINE__);
template <typename T, typename U>
inline T div_up(const T a, const U b) {
assert(b);
return (a + b - 1) / b;
}
template <typename T, typename U>
inline T rnd_up(const T a, const U b) {
return div_up(a, b) * b;
}
} // namespace MKLDNNPlugin

View File

@ -120,6 +120,7 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
std::vector<std::pair<ngraph::element::Type, ngraph::element::Type>> convert_precision_list{
{ngraph::element::i64, ngraph::element::i32},
{ngraph::element::u64, ngraph::element::i32},
{ngraph::element::i16, ngraph::element::i32},
{ngraph::element::u16, ngraph::element::i32},
{ngraph::element::u32, ngraph::element::i32},
{ngraph::element::f16, ngraph::element::f32},
@ -325,6 +326,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
NetPass::ConvertPrecision(implNetworkWrapper, Precision::FP16, Precision::FP32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::BOOL, Precision::U8);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::U16, Precision::I32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::I16, Precision::I32);
}
}
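// With these entries both paths (the ngraph conversion list above and the
// legacy NetPass) now widen 16-bit signed integers to 32 bit before the
// network reaches the plugin; the per-element promotion is lossless:
//
//     int16_t x = -1234;
//     int32_t y = static_cast<int32_t>(x);  // still -1234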

View File

@ -4,9 +4,6 @@
#include <mkldnn_types.h>
#include "mkldnn_primitive.h"
#include "../../thirdparty/mkl-dnn/src/common/primitive_desc.hpp"
#include "../../thirdparty/mkl-dnn/src/common/memory_pd.hpp"
#include "../../thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp"
using namespace MKLDNNPlugin;
@ -20,65 +17,11 @@ mkldnn::primitive MKLDNNPrimitive::operator*() {
return *prim;
}
void MKLDNNPrimitive::reset(mkldnn::primitive* prim) {
this->prim.reset(prim);
void MKLDNNPrimitive::reset(mkldnn::primitive* primitive) {
prim.reset(primitive);
}
MKLDNNPrimitive &MKLDNNPrimitive::operator=(const std::shared_ptr<mkldnn::primitive>& prim) {
this->prim = prim;
MKLDNNPrimitive &MKLDNNPrimitive::operator=(const std::shared_ptr<mkldnn::primitive>& primitive) {
prim = primitive;
return *this;
}
void MKLDNNPrimitive::setBatchLimit(int batch, size_t inputNum, size_t outputNum) {
bool success = true;
auto * primDesc = prim->get_primitive_desc();
auto * concatPrimDesc = dynamic_cast<const mkldnn::impl::cpu::cpu_concat_pd_t *>(primDesc);
for (int i = 0; success && i < primDesc->n_inputs() && i < inputNum; i++) {
// Depthwise layers contains weights as input
if (primDesc->input_pd()->desc()->ndims != primDesc->input_pd(i)->desc()->ndims)
break;
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->input_pd(i)->desc());
if (originInputBatches.size() <= i)
originInputBatches.push_back(memDesc->dims[0]);
if (batch > originInputBatches[i])
success = false;
memDesc->dims[0] = batch;
memDesc->layout_desc.blocking.padding_dims[0] = batch;
if (concatPrimDesc != nullptr) {
memDesc = const_cast<mkldnn_memory_desc_t *>(concatPrimDesc->src_image_pd(i)->desc());
memDesc->dims[0] = batch;
memDesc->layout_desc.blocking.padding_dims[0] = batch;
}
}
for (int i = 0; success && i < primDesc->n_outputs() && i < outputNum; i++) {
if (primDesc->output_pd()->desc()->ndims != primDesc->output_pd(i)->desc()->ndims)
break;
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->output_pd(i)->desc());
if (i < inputNum && memDesc == primDesc->input_pd(i)->desc())
continue;
if (originOutputBatches.size() <= i)
originOutputBatches.push_back(memDesc->dims[0]);
if (batch > originOutputBatches[i])
success = false;
memDesc->dims[0] = batch;
memDesc->layout_desc.blocking.padding_dims[0] = batch;
}
if (success)
return;
for (int i = 0; i < primDesc->n_inputs() && i < originInputBatches.size(); i++) {
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->input_pd(i)->desc());
memDesc->dims[0] = originInputBatches[i];
memDesc->layout_desc.blocking.padding_dims[0] = originInputBatches[i];
}
for (int i = 0; i < primDesc->n_outputs() && i < originOutputBatches.size(); i++) {
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->output_pd(i)->desc());
memDesc->dims[0] = originOutputBatches[i];
memDesc->layout_desc.blocking.padding_dims[0] = originOutputBatches[i];
}
THROW_IE_EXCEPTION << "Dynamic batch cannot be changed!";
}

View File

@ -17,16 +17,13 @@ class MKLDNNPrimitive {
public:
MKLDNNPrimitive();
operator bool();
MKLDNNPrimitive& operator=(const std::shared_ptr<mkldnn::primitive>& prim);
MKLDNNPrimitive& operator=(const std::shared_ptr<mkldnn::primitive>& primitive);
mkldnn::primitive operator*();
void reset(mkldnn::primitive* prim);
void setBatchLimit(int batch, size_t inputNum, size_t outputNum);
void reset(mkldnn::primitive* primitive);
private:
std::shared_ptr<mkldnn::primitive> prim;
std::vector<int> originInputBatches;
std::vector<int> originOutputBatches;
};
} // namespace MKLDNNPlugin

View File

@ -3,6 +3,9 @@
//
#include "emitter.h"
#include "utils/general_utils.h"
#include <vector>
using namespace mkldnn::impl::cpu;
@ -11,28 +14,19 @@ using namespace Xbyak;
namespace MKLDNNPlugin {
template <typename T, typename P>
constexpr bool one_of(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
constexpr bool one_of(T val, P item, Args... item_others) {
return val == item || one_of(val, item_others...);
}
size_t jit_emitter::get_max_vecs_count() const {
return one_of(host_isa_, cpu::avx512_common, cpu::avx512_core) ? 32 : 16;
return one_of(host_isa_, cpu::x64::avx512_common, cpu::x64::avx512_core) ? 32 : 16;
}
size_t jit_emitter::get_vec_length() const {
return one_of(host_isa_, cpu::avx512_common, cpu::avx512_core) ? 64 :
one_of(host_isa_, cpu::avx2) ? 32 : 16;
return one_of(host_isa_, cpu::x64::avx512_common, cpu::x64::avx512_core) ? 64 :
one_of(host_isa_, cpu::x64::avx2) ? 32 : 16;
}
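// The numbers above follow the x86-64 SIMD register file: SSE4.1/AVX2 expose
// 16 vector registers while AVX-512 exposes 32 (hence get_max_vecs_count), and
// the register widths are 16 bytes (XMM), 32 bytes (YMM) and 64 bytes (ZMM),
// hence the 64 : 32 : 16 split in get_vec_length().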
void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
h->uni_vmovups(addr, Xmm(vec_idx));
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
h->uni_vmovups(addr, Ymm(vec_idx));
} else {
h->uni_vmovups(addr, Zmm(vec_idx));
@ -40,9 +34,9 @@ void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const {
}
void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address &addr) const {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
h->uni_vmovups(Xmm(vec_idx), addr);
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
h->uni_vmovups(Ymm(vec_idx), addr);
} else {
h->uni_vmovups(Zmm(vec_idx), addr);
@ -69,8 +63,8 @@ void jit_emitter::emitter_preamble(const std::vector<size_t> &in_vec_idxs, const
for (auto idx : pool_vec_idxs)
aux_vec_idxs.push_back(idx);
// For sse42 mask register has to be Xmm(0)
if (host_isa_ == cpu::sse42 && aux_vecs_count() > 0) {
// For sse41 mask register has to be Xmm(0)
if (host_isa_ == cpu::x64::sse41 && aux_vecs_count() > 0) {
size_t idx = 0;
assert(std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) == in_vec_idxs.end());
if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) == aux_vec_idxs.end()) {

View File

@ -5,15 +5,16 @@
#pragma once
#include <ie_common.h>
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "mkldnn_node.h"
#include <set>
namespace MKLDNNPlugin {
class jit_emitter {
public:
jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_emitter(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
: h(host), host_isa_(host_isa), n(node), exec_prc_(exec_prc) {
k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well
@ -33,8 +34,8 @@ protected:
size_t get_vec_length() const;
const MKLDNNNode* n;
mkldnn::impl::cpu::jit_generator* h;
mkldnn::impl::cpu::cpu_isa_t host_isa_;
mkldnn::impl::cpu::x64::jit_generator* h;
mkldnn::impl::cpu::x64::cpu_isa_t host_isa_;
InferenceEngine::Precision exec_prc_;
Xbyak::Opmask k_mask;
@ -63,12 +64,12 @@ protected:
Xbyak::Label l_table;
enum {
_cmp_eq_oq = mkldnn::impl::cpu::jit_generator::_cmp_eq_oq,
_cmp_neq_uq = mkldnn::impl::cpu::jit_generator::_cmp_neq_uq,
_cmp_lt_os = mkldnn::impl::cpu::jit_generator::_cmp_lt_os,
_cmp_le_os = mkldnn::impl::cpu::jit_generator::_cmp_le_os,
_cmp_ge_os = mkldnn::impl::cpu::jit_generator::_cmp_nlt_us,
_cmp_gt_os = mkldnn::impl::cpu::jit_generator::_cmp_nle_us,
_cmp_eq_oq = mkldnn::impl::cpu::x64::jit_generator::_cmp_eq_oq,
_cmp_neq_uq = mkldnn::impl::cpu::x64::jit_generator::_cmp_neq_uq,
_cmp_lt_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_lt_os,
_cmp_le_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_le_os,
_cmp_ge_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_nlt_us,
_cmp_gt_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_nle_us,
};
virtual void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,

View File

@ -2,18 +2,23 @@
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <ie_parallel.hpp>
#include <mkldnn_extension_utils.h>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "utils/bfloat16.hpp"
#include "softmax.h"
#include <ie_parallel.hpp>
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
#include <mkldnn.hpp> // TODO: just to replace mkldnn->dnnl via macros
#include "utils/bfloat16.hpp"
#include <algorithm>
#include <cassert>
#include <vector>
using namespace InferenceEngine;
using namespace MKLDNNPlugin;
using namespace mkldnn;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
#define GET_OFF(field) offsetof(jit_args_softmax, field)
@ -39,14 +44,23 @@ struct jit_uni_softmax_kernel {
jit_uni_softmax_kernel() : ker_(nullptr) {}
virtual ~jit_uni_softmax_kernel() {}
virtual void create_ker() = 0;
};
template <cpu_isa_t isa>
struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32)
jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jcp_(jcp), jit_uni_softmax_kernel(), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, mkldnn::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
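The constructor no longer builds the kernel: oneDNN v1.6 splits the lifecycle into generate(), which only emits instructions, and create_kernel(), which finalizes the code buffer, with jit_ker() returning the entry point that the plugin's create_ker() wrapper caches in ker_. A toy illustration of that shape (hypothetical classes, not the oneDNN API verbatim):

#include <cstdio>

// Toy sketch (hypothetical classes) of the two-phase lifecycle the hunk
// migrates to: generate() emits code, create_kernel() runs it and finalizes
// the buffer, jit_ker() hands back the entry point cached in ker_.
using kernel_fn = void (*)();

struct toy_generator {
    virtual void generate() = 0;                   // emit instructions
    void create_kernel() { generate(); ready_ = true; }
    kernel_fn jit_ker() const { return ready_ ? &body : nullptr; }
    virtual ~toy_generator() = default;
private:
    static void body() { std::puts("kernel body runs"); }
    bool ready_ = false;
};

struct toy_softmax_kernel : toy_generator {
    void generate() override { /* max / exp / div loops would be emitted here */ }
};

int main() {
    toy_softmax_kernel k;
    k.create_kernel();            // replaces getCode() at constructor time
    if (kernel_fn ker = k.jit_ker()) ker();
}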
@ -69,23 +83,23 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
mov(aux_reg_work_amount, reg_work_amount);
mov(aux_reg_src, reg_src);
load_vector(vmm_max, ptr[aux_reg_src], jcp.src_dt);
load_vector(vmm_max, ptr[aux_reg_src], jcp_.src_dt);
L(max_loop_label); {
cmp(aux_reg_work_amount, 0);
jle(max_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
load_vector(vmm_val, ptr[aux_reg_src], jcp_.src_dt);
if (isa == cpu::sse42) {
if (isa == x64::sse41) {
uni_vmovups(vmm_mask, vmm_val);
uni_vcmpgtps(vmm_mask, vmm_mask, vmm_max);
} else if (isa == cpu::avx2) {
} else if (isa == x64::avx2) {
uni_vcmpgtps(vmm_mask, vmm_val, vmm_max);
} else {
vcmpps(k_mask, vmm_val, vmm_max, _cmp_nle_us);
}
if (isa == cpu::avx512_common) {
if (isa == x64::avx512_common) {
vptestmd(k_mask, vmm_mask, vmm_mask);
vblendmps(vmm_max | k_mask, vmm_max, vmm_val);
} else {
@ -108,13 +122,13 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
cmp(aux_reg_work_amount, 0);
jle(exp_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
load_vector(vmm_val, ptr[aux_reg_src], jcp_.src_dt);
uni_vsubps(vmm_val, vmm_val, vmm_max);
exp_injector->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
uni_vaddps(vmm_exp_sum, vmm_exp_sum, vmm_val);
store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt);
store_vector(ptr[aux_reg_dst], vmm_val, jcp_.dst_dt);
add(aux_reg_src, reg_src_stride);
add(aux_reg_dst, reg_dst_stride);
@ -131,11 +145,11 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
cmp(aux_reg_work_amount, 0);
jle(div_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[aux_reg_dst], jcp.dst_dt);
load_vector(vmm_val, ptr[aux_reg_dst], jcp_.dst_dt);
uni_vdivps(vmm_val, vmm_val, vmm_exp_sum);
store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt);
store_vector(ptr[aux_reg_dst], vmm_val, jcp_.dst_dt);
add(aux_reg_dst, reg_dst_stride);
sub(aux_reg_work_amount, 1);
@ -151,13 +165,10 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
emu_vcvtneps2bf16->emit_table();
exp_injector->prepare_table();
ker_ = (decltype(ker_))this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == x64::sse41, Xbyak::Xmm, isa == x64::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Reg64 reg_src = r8;
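For reference, the three labelled loops above (max, exp-and-accumulate, divide) compute a numerically stable softmax; a scalar equivalent:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Scalar reference for the three JIT loops above: subtract the running max
// for stability, exponentiate and accumulate, then normalize.
void softmax_ref(const float* src, float* dst, size_t n) {
    float mx = *std::max_element(src, src + n);   // max loop
    float sum = 0.f;
    for (size_t i = 0; i < n; ++i) {              // exp loop
        dst[i] = std::exp(src[i] - mx);
        sum += dst[i];
    }
    for (size_t i = 0; i < n; ++i)                // div loop
        dst[i] /= sum;
}

int main() {
    float in[4] = {1.f, 2.f, 3.f, 4.f}, out[4];
    softmax_ref(in, out, 4);
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
}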
@ -181,6 +192,8 @@ private:
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
jit_softmax_config_params jcp_;
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_dt) {
switch (src_dt) {
case Precision::FP32:
@ -227,16 +240,18 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
jcp.src_dt = inpPrc;
jcp.dst_dt = outPrc;
if (mayiuse(cpu::avx512_common)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx512_common>(jcp));
if (mayiuse(x64::avx512_common)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<x64::avx512_common>(jcp));
block_size = 16;
} else if (mayiuse(cpu::avx2)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx2>(jcp));
} else if (mayiuse(x64::avx2)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<x64::avx2>(jcp));
block_size = 8;
} else if (mayiuse(cpu::sse42)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::sse42>(jcp));
} else if (mayiuse(x64::sse41)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<x64::sse41>(jcp));
block_size = 4;
}
if (softmax_kernel)
softmax_kernel->create_ker();
}
template<typename in_data_t, typename out_data_t>
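The dispatch above probes the widest usable ISA first and records block_size as the number of fp32 lanes in one vector register (zmm = 16, ymm = 8, xmm = 4). A hedged sketch of the same shape, using the GCC/Clang builtin probe in place of oneDNN's mayiuse():

#include <cstddef>
#include <cstdio>

// Sketch of the runtime dispatch above; __builtin_cpu_supports is a
// GCC/Clang builtin standing in for oneDNN's mayiuse(). block_size is the
// fp32 lane count of the widest usable vector register.
size_t pick_block_size() {
    if (__builtin_cpu_supports("avx512f")) return 16;  // zmm: 512 / 32
    if (__builtin_cpu_supports("avx2"))    return 8;   // ymm: 256 / 32
    if (__builtin_cpu_supports("sse4.1"))  return 4;   // xmm: 128 / 32
    return 1;                                          // scalar fallback
}

int main() { std::printf("block_size = %zu\n", pick_block_size()); }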


@ -5,12 +5,15 @@
#include "common/emitter.h"
#include "jit_eltwise_emitters.hpp"
#include "mkldnn_eltwise_node.h"
#include "jit_uni_eltwise.hpp"
#include <cpu/x64/jit_uni_eltwise.hpp>
#include "legacy/ie_layers.h"
using namespace InferenceEngine;
using namespace mkldnn::impl::utils;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu::x64;
using namespace Xbyak;
namespace MKLDNNPlugin {
@ -23,25 +26,25 @@ size_t jit_add_emitter::get_inputs_num() { return 2; }
void jit_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vaddps(vmm_dst, vmm_dst, vmm_src1);
} else {
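Two patterns recur through the emitter hunks above and below: conditional3 picks the register type from the ISA parameter at compile time (Xmm for sse41, Ymm for avx2, Zmm otherwise), and the sse41 branches copy src0 into dst before operating, since non-VEX SSE instructions overwrite their first operand. A compile-time sketch of the type selection (stand-in types; oneDNN's real conditional3 lives in its utils header):

#include <type_traits>

// Equivalent shape of oneDNN's conditional3: a three-way compile-time
// type selection keyed on two booleans.
template <bool C1, typename T1, bool C2, typename T2, typename T3>
using conditional3_t =
    typename std::conditional<C1, T1,
        typename std::conditional<C2, T2, T3>::type>::type;

struct Xmm {}; struct Ymm {}; struct Zmm {};   // stand-ins for Xbyak types
enum class isa { sse41, avx2, avx512_common };

template <isa I>
using Vmm = conditional3_t<I == isa::sse41, Xmm, I == isa::avx2, Ymm, Zmm>;

static_assert(std::is_same<Vmm<isa::sse41>, Xmm>::value, "xmm on sse41");
static_assert(std::is_same<Vmm<isa::avx2>, Ymm>::value, "ymm on avx2");
static_assert(std::is_same<Vmm<isa::avx512_common>, Zmm>::value, "zmm otherwise");

int main() {}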
@ -57,27 +60,27 @@ size_t jit_mul_add_emitter::get_inputs_num() { return 3; }
void jit_mul_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_mul_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_src2 = Vmm(in_vec_idxs[2]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->mulps(vmm_dst, vmm_src1);
h->addps(vmm_dst, vmm_src2);
@ -116,25 +119,25 @@ size_t jit_subtract_emitter::get_inputs_num() { return 2; }
void jit_subtract_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_subtract_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1);
} else {
@ -151,25 +154,25 @@ size_t jit_multiply_emitter::get_inputs_num() { return 2; }
void jit_multiply_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_multiply_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1);
} else {
@ -186,20 +189,20 @@ size_t jit_divide_emitter::get_inputs_num() { return 2; }
void jit_divide_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -226,7 +229,7 @@ void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
}
};
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
uni_vdiv(vmm_dst, vmm_dst, vmm_src1);
} else {
@ -250,26 +253,26 @@ size_t jit_floor_mod_emitter::get_inputs_num() { return 2; }
void jit_floor_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_floor_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_dst.getIdx() != vmm_src0.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmovups(vmm_aux0, vmm_src0);
@ -299,26 +302,26 @@ size_t jit_mod_emitter::get_inputs_num() { return 2; }
void jit_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_dst.getIdx() != vmm_src0.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmovups(vmm_aux0, vmm_src0);
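For the two modulo emitters above, a scalar reference under the usual op semantics (an assumption here, stated plainly: FloorMod follows the divisor's sign via floor division, Mod truncates toward zero):

#include <cmath>
#include <cstdio>

// Scalar reference (assumed semantics) for the two modulo emitters above:
// floor_mod uses floor division, mod truncates toward zero.
float floor_mod_ref(float a, float b) { return a - std::floor(a / b) * b; }
float mod_ref(float a, float b)       { return a - std::trunc(a / b) * b; }

int main() {
    std::printf("%f %f\n", floor_mod_ref(-7.f, 3.f), mod_ref(-7.f, 3.f));  // 2, -1
}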
@ -348,20 +351,20 @@ size_t jit_maximum_emitter::get_inputs_num() { return 2; }
void jit_maximum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_maximum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -374,7 +377,7 @@ void jit_maximum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
}
};
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
uni_vmax(vmm_dst, vmm_dst, vmm_src1);
@ -395,20 +398,20 @@ size_t jit_minimum_emitter::get_inputs_num() { return 2; }
void jit_minimum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_minimum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -421,7 +424,7 @@ void jit_minimum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
}
};
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
uni_vmin(vmm_dst, vmm_dst, vmm_src1);
@ -442,25 +445,25 @@ size_t jit_squared_difference_emitter::get_inputs_num() { return 2; }
void jit_squared_difference_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_squared_difference_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1);
@ -480,20 +483,20 @@ size_t jit_power_dynamic_emitter::get_inputs_num() { return 2; }
void jit_power_dynamic_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -512,7 +515,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
// caller obligation to save k-regs as callee may use them
size_t n_k_regs_to_save = 8;
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
for (size_t i = 0; i < n_k_regs_to_save; ++i) {
if (mayiuse(avx512_core))
@ -561,7 +564,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
// restore k registers
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
if (mayiuse(avx512_core))
h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
@ -588,33 +591,33 @@ size_t jit_equal_emitter::get_inputs_num() { return 2; }
void jit_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
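All comparison emitters in this file share one shape: compare lanewise, then blend the table constants "one"/"zero" into the destination under the resulting mask. Only the mask mechanics differ per ISA: implicit-XMM0 blendvps on sse41, vblendvps on avx2, opmask registers on avx512. A scalar reference for the equal case:

#include <cstdio>

// Scalar reference for the equal emitter above: per lane, the JIT
// materializes 1.0f where src0 == src1 and 0.0f elsewhere.
float equal_ref(float a, float b) { return a == b ? 1.0f : 0.0f; }

int main() { std::printf("%f %f\n", equal_ref(1.f, 1.f), equal_ref(1.f, 2.f)); }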
@ -644,33 +647,33 @@ size_t jit_not_equal_emitter::get_inputs_num() { return 2; }
void jit_not_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
h->pxor(vmm_aux1, vmm_aux1);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -700,33 +703,33 @@ size_t jit_greater_emitter::get_inputs_num() { return 2; }
void jit_greater_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_greater_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_gt_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpgtps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -756,33 +759,33 @@ size_t jit_greater_equal_emitter::get_inputs_num() { return 2; }
void jit_greater_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_greater_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_ge_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpgeps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -812,33 +815,33 @@ size_t jit_less_emitter::get_inputs_num() { return 2; }
void jit_less_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_less_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_lt_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpltps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -868,20 +871,20 @@ size_t jit_less_equal_emitter::get_inputs_num() { return 2; }
void jit_less_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_less_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -889,13 +892,13 @@ void jit_less_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, co
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_le_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpleps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -925,20 +928,20 @@ size_t jit_logical_and_emitter::get_inputs_num() { return 2; }
void jit_logical_and_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -946,7 +949,7 @@ void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
@ -960,7 +963,7 @@ void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
h->blendvps(vmm_aux2, vmm_aux1);
h->uni_vandps(vmm_dst, vmm_dst, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -1002,20 +1005,20 @@ size_t jit_logical_or_emitter::get_inputs_num() { return 2; }
void jit_logical_or_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -1023,7 +1026,7 @@ void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, co
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
@ -1037,7 +1040,7 @@ void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, co
h->blendvps(vmm_aux2, vmm_aux1);
h->uni_vorps(vmm_dst, vmm_dst, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -1078,20 +1081,20 @@ size_t jit_logical_xor_emitter::get_inputs_num() { return 2; }
void jit_logical_xor_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -1099,7 +1102,7 @@ void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
@ -1113,7 +1116,7 @@ void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
h->blendvps(vmm_aux2, vmm_aux1);
h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -1154,32 +1157,32 @@ size_t jit_logical_not_emitter::get_inputs_num() { return 1; }
void jit_logical_not_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_not_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -1209,20 +1212,20 @@ size_t jit_power_static_emitter::get_inputs_num() { return 1; }
void jit_power_static_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
@ -1238,7 +1241,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
Xmm xmm0 = Xmm(0), xmm1 = Xmm(1);
if (scale != 1.f || shift != 0.f) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_aux0, table_val("scale"));
h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src0);
h->uni_vmovups(vmm_dst, table_val("shift"));
@ -1264,7 +1267,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
if (power < 0.f) {
h->uni_vmovups(vmm_aux0, table_val("one"));
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
h->uni_vmovups(vmm_dst, vmm_aux0);
} else {
@ -1280,7 +1283,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
if (power < 0.f) {
h->uni_vmovups(vmm_aux0, table_val("one"));
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
h->uni_vmovups(vmm_dst, vmm_aux0);
} else {
@ -1302,7 +1305,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
// caller obligation to save k-regs as callee may use them
size_t n_k_regs_to_save = 8;
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
for (size_t i = 0; i < n_k_regs_to_save; ++i) {
if (mayiuse(avx512_core))
@ -1351,7 +1354,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
// restore k registers
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
if (mayiuse(avx512_core))
h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
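Taken together, the power-static hunks compute (scale * x + shift) raised to power, with the reciprocal taken at the end for negative powers; the k-register spill brackets the external pow call, per the comment about caller obligations. A scalar reference (parameter names follow the "scale"/"shift" table entries):

#include <cmath>
#include <cstdio>

// Scalar reference for the static-power emitter above:
// out = (scale*x + shift)^power, computing the positive power first and
// taking the reciprocal for negative powers, as the hunks show.
float power_static_ref(float x, float scale, float shift, float power) {
    float v = scale * x + shift;
    float p = std::pow(v, std::fabs(power));
    return power < 0.f ? 1.f / p : p;
}

int main() {
    std::printf("%f\n", power_static_ref(2.f, 1.f, 0.f, -2.f));  // 0.25
}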
@ -1397,27 +1400,27 @@ size_t jit_prelu_emitter::get_inputs_num() { return 2; }
void jit_prelu_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_prelu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_gt_os);
h->movups(vmm_aux1, vmm_src1);
@ -1425,12 +1428,12 @@ void jit_prelu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const s
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->movups(vmm_dst, vmm_src0);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vmulps(vmm_aux0, vmm_src0, vmm_src1);
h->vxorps(vmm_aux1, vmm_aux1, vmm_aux1);
h->vcmpgtps(vmm_aux1, vmm_src0, vmm_aux1);
h->vblendvps(vmm_dst, vmm_aux0, vmm_src0, vmm_aux1);
} else if (isa == cpu::avx512_common) {
} else if (isa == cpu::x64::avx512_common) {
h->vxorpd(vmm_aux0, vmm_aux0, vmm_aux0);
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->vmovups(vmm_dst, vmm_src0);
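Finally, the PReLU emitter blends on the sign of the input: positive lanes pass through, negative lanes are scaled by the per-element slope held in src1. A scalar reference:

#include <cstdio>

// Scalar reference for the PReLU emitter above: pass positive inputs
// through, scale negative ones by the per-element slope.
float prelu_ref(float x, float slope) { return x > 0.f ? x : slope * x; }

int main() { std::printf("%f %f\n", prelu_ref(3.f, 0.25f), prelu_ref(-4.f, 0.25f)); }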


@ -5,14 +5,14 @@
#pragma once
#include "common/emitter.h"
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "mkldnn_node.h"
namespace MKLDNNPlugin {
class jit_add_emitter : public jit_emitter {
public:
jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -21,13 +21,13 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_mul_add_emitter : public jit_emitter {
public:
jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_mul_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -36,7 +36,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
@ -45,7 +45,7 @@ private:
class jit_subtract_emitter : public jit_emitter {
public:
jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_subtract_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -54,14 +54,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_multiply_emitter : public jit_emitter {
public:
jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_multiply_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -70,14 +70,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_divide_emitter : public jit_emitter {
public:
jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_divide_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -87,7 +87,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
};
@ -95,7 +95,7 @@ private:
class jit_floor_mod_emitter : public jit_emitter {
public:
jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_floor_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -104,7 +104,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
};
@ -112,7 +112,7 @@ private:
class jit_mod_emitter : public jit_emitter {
public:
jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -121,7 +121,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
};
@ -129,7 +129,7 @@ private:
class jit_maximum_emitter : public jit_emitter {
public:
jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_maximum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -139,14 +139,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_minimum_emitter : public jit_emitter {
public:
jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_minimum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -156,14 +156,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_squared_difference_emitter : public jit_emitter {
public:
jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_squared_difference_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -172,14 +172,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_power_dynamic_emitter : public jit_emitter {
public:
jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_power_dynamic_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -188,14 +188,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_equal_emitter : public jit_emitter {
public:
jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -204,7 +204,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -214,7 +214,7 @@ private:
class jit_not_equal_emitter : public jit_emitter {
public:
jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_not_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -223,7 +223,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -233,7 +233,7 @@ private:
class jit_greater_emitter : public jit_emitter {
public:
jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_greater_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -242,7 +242,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -252,7 +252,7 @@ private:
class jit_greater_equal_emitter : public jit_emitter {
public:
jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_greater_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -261,7 +261,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -271,7 +271,7 @@ private:
class jit_less_emitter : public jit_emitter {
public:
jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_less_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -280,7 +280,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -290,7 +290,7 @@ private:
class jit_less_equal_emitter : public jit_emitter {
public:
jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_less_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -299,7 +299,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -309,7 +309,7 @@ private:
class jit_logical_and_emitter : public jit_emitter {
public:
jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_and_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -318,7 +318,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -328,7 +328,7 @@ private:
class jit_logical_or_emitter : public jit_emitter {
public:
jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_or_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -337,7 +337,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -347,7 +347,7 @@ private:
class jit_logical_xor_emitter : public jit_emitter {
public:
jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_xor_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -356,7 +356,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -365,7 +365,7 @@ private:
class jit_logical_not_emitter : public jit_emitter {
public:
jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_not_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -374,7 +374,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -383,7 +383,7 @@ private:
class jit_power_static_emitter : public jit_emitter {
public:
jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_power_static_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -392,7 +392,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -401,7 +401,7 @@ private:
class jit_prelu_emitter : public jit_emitter {
public:
jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_prelu_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -410,7 +410,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
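
Every hunk in this header is the same mechanical change: in oneDNN v1.6 the x86 JIT infrastructure moved from mkldnn::impl::cpu into mkldnn::impl::cpu::x64, so the jit_generator host pointer and the cpu_isa_t template and constructor parameters all gain one namespace level. A minimal sketch of the resulting declaration shape, using a hypothetical jit_example_emitter rather than any class from this diff:

#include <cpu/x64/jit_generator.hpp>

class jit_example_emitter : public jit_emitter {
public:
    jit_example_emitter(mkldnn::impl::cpu::x64::jit_generator *host,
                        mkldnn::impl::cpu::x64::cpu_isa_t host_isa,
                        const MKLDNNNode *node,
                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);

private:
    // The per-ISA body stays a template; only the namespace of cpu_isa_t changed.
    template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
    void emit_isa(const std::vector<size_t> &in_vec_idxs,
                  const std::vector<size_t> &out_vec_idxs) const;
};
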

View File

@ -8,7 +8,8 @@
#include "legacy/ie_layers.h"
using namespace mkldnn::impl::utils;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu::x64;
using namespace Xbyak;
namespace MKLDNNPlugin {
@ -19,15 +20,15 @@ jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa,
auto alg = static_cast<mkldnn_alg_kind_t>(eltwiseNode.getAlgorithm());
if (host_isa_ == cpu::sse42) {
eltwise_injector_sse42 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::sse42>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
} else if (host_isa_ == cpu::avx2) {
eltwise_injector_avx2 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::avx2>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
} else if (host_isa_ == cpu::avx512_common) {
eltwise_injector_avx512_common = std::make_shared<jit_uni_eltwise_injector_f32<cpu::avx512_common>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
if (host_isa_ == cpu::x64::sse41) {
eltwise_injector_sse42 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::x64::sse41>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1);
} else if (host_isa_ == cpu::x64::avx2) {
eltwise_injector_avx2 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::x64::avx2>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1);
} else if (host_isa_ == cpu::x64::avx512_common) {
eltwise_injector_avx512_common = std::make_shared<jit_uni_eltwise_injector_f32<cpu::x64::avx512_common>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1);
} else {
assert(!"unsupported isa");
}
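
Two API changes meet in this constructor: the SSE ISA enum value is renamed from sse42 to sse41, and jit_uni_eltwise_injector_f32 takes an extra trailing constructor argument (the literal 1 above; reading the v1.6 headers this appears to be the output scale, but treat that as an assumption). A hedged sketch that folds the three branches into one helper:

template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
static std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<isa>>
make_injector(mkldnn::impl::cpu::x64::jit_generator *host,
              mkldnn_alg_kind_t alg, float alpha, float beta) {
    // Trailing argument is the fifth parameter added by the v1.6 injector API.
    return std::make_shared<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<isa>>(
            host, alg, alpha, beta, 1.f);
}
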
@ -37,15 +38,15 @@ size_t jit_mkldnn_emitter::get_inputs_num() { return 1; }
void jit_mkldnn_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
if (out_vec_idxs[0] != in_vec_idxs[0])
h->uni_vmovups(Xmm(out_vec_idxs[0]), Xmm(in_vec_idxs[0]));
eltwise_injector_sse42->compute_vector(out_vec_idxs[0]);
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
if (out_vec_idxs[0] != in_vec_idxs[0])
h->uni_vmovups(Ymm(out_vec_idxs[0]), Ymm(in_vec_idxs[0]));
eltwise_injector_avx2->compute_vector(out_vec_idxs[0]);
} else if (host_isa_ == cpu::avx512_common) {
} else if (host_isa_ == cpu::x64::avx512_common) {
if (out_vec_idxs[0] != in_vec_idxs[0])
h->uni_vmovups(Zmm(out_vec_idxs[0]), Zmm(in_vec_idxs[0]));
eltwise_injector_avx512_common->compute_vector(out_vec_idxs[0]);
@ -55,11 +56,11 @@ void jit_mkldnn_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std:
}
void jit_mkldnn_emitter::emit_table() {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
eltwise_injector_sse42->prepare_table();
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
eltwise_injector_avx2->prepare_table();
} else if (host_isa_ == cpu::avx512_common) {
} else if (host_isa_ == cpu::x64::avx512_common) {
eltwise_injector_avx512_common->prepare_table();
} else {
assert(!"unsupported isa");

View File

@ -5,15 +5,16 @@
#pragma once
#include "common/emitter.h"
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "mkldnn_node.h"
#include "jit_uni_eltwise.hpp"
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
namespace MKLDNNPlugin {
class jit_mkldnn_emitter : public jit_emitter {
public:
jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -24,9 +25,9 @@ public:
void emit_table() override;
private:
std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::sse42>> eltwise_injector_sse42;
std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx2>> eltwise_injector_avx2;
std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx512_common>> eltwise_injector_avx512_common;
std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::x64::sse41>> eltwise_injector_sse42;
std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::x64::avx2>> eltwise_injector_avx2;
std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::x64::avx512_common>> eltwise_injector_avx512_common;
};
} // namespace MKLDNNPlugin

View File

@ -14,16 +14,16 @@ MKLDNNBatchNormalizationNode::MKLDNNBatchNormalizationNode(const InferenceEngine
const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return GetVarianceDesc(primitive_desc_it.fetch());
return GetVarianceDesc(primitive_desc_it);
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return GetMeanDesc(primitive_desc_it.fetch());
return GetMeanDesc(primitive_desc_it);
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (!fusedWithScale())
return MKLDNNMemoryDesc();
return GetScaleShiftWeightsDesc(primitive_desc_it.fetch());
return GetScaleShiftWeightsDesc(primitive_desc_it);
});
}
@ -105,57 +105,29 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() {
}
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetVarianceDesc(const memory::primitive_desc &primitive_desc) const {
memory::primitive_desc aprimitive_desc;
mkldnn_primitive_desc_t bndesc = nullptr;
static MKLDNNMemoryDesc get_bn_mdesc_by_index(const mkldnn::primitive_desc_iterator &primitive_desc, int idx) {
mkldnn_batch_normalization_desc_t *p;
error::wrap_c_api(mkldnn_primitive_desc_query(
primitive_desc.get(), mkldnn::convert_to_c(batch_normalization_d), 0, &p),
primitive_desc.get(), mkldnn::convert_to_c(mkldnn::query::batch_normalization_d), 0, &p),
"could not get a batch-normalization descriptor");
const_mkldnn_primitive_desc_t const_bndesc =
(p->flags & use_global_stats) ?
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(src_pd), 2) :
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(dst_pd), 2);
error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
const_bndesc),
"could not clone a variance primitive descriptor");
aprimitive_desc.reset(bndesc);
return MKLDNNMemoryDesc(aprimitive_desc.desc());
auto bndesc =
(p->flags & mkldnn::convert_to_c(mkldnn::normalization_flags::use_global_stats)) ?
primitive_desc.src_desc(idx) : primitive_desc.dst_desc(idx);
return MKLDNNMemoryDesc {bndesc};
}
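
The helper above leans on a v1.x convenience: a primitive descriptor exposes its memory descriptors directly via src_desc(idx) and dst_desc(idx), so the old query-pd, clone, wrap sequence collapses into one expression. The idea in isolation (a sketch, names hypothetical):

static mkldnn::memory::desc stat_md(const mkldnn::primitive_desc &pd,
                                    int idx, bool use_global_stats) {
    // With global stats the mean/variance come in as extra sources,
    // otherwise they are produced as extra destinations.
    return use_global_stats ? pd.src_desc(idx) : pd.dst_desc(idx);
}
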
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetMeanDesc(const memory::primitive_desc &primitive_desc) const {
memory::primitive_desc aprimitive_desc;
mkldnn_primitive_desc_t bndesc = nullptr;
mkldnn_batch_normalization_desc_t *p;
error::wrap_c_api(mkldnn_primitive_desc_query(
primitive_desc.get(), mkldnn::convert_to_c(batch_normalization_d), 0, &p),
"could not get a batch-normalization descriptor");
const_mkldnn_primitive_desc_t const_bndesc =
(p->flags & use_global_stats) ?
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(src_pd), 1) :
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(dst_pd), 1);
error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
const_bndesc),
"could not clone a mean primitive descriptor");
aprimitive_desc.reset(bndesc);
return MKLDNNMemoryDesc(aprimitive_desc.desc());
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetVarianceDesc(const mkldnn::primitive_desc &primitive_desc) const {
// TODO: rewrite using stat_desc
return get_bn_mdesc_by_index(primitive_desc, 2);
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetScaleShiftWeightsDesc(const memory::primitive_desc &primitive_desc) const {
memory::primitive_desc adesc;
mkldnn_primitive_desc_t bndesc = nullptr;
const_mkldnn_primitive_desc_t const_bndesc =
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(weights_pd), 0);
error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
const_bndesc),
"could not clone a weights primitive descriptor");
adesc.reset(bndesc);
return MKLDNNMemoryDesc(adesc.desc());
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetMeanDesc(const mkldnn::primitive_desc &primitive_desc) const {
return get_bn_mdesc_by_index(primitive_desc, 1);
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetScaleShiftWeightsDesc(const mkldnn::primitive_desc &primitive_desc) const {
return MKLDNNMemoryDesc(primitive_desc.weights_desc(0));
}
bool MKLDNNBatchNormalizationNode::created() const {
@ -166,23 +138,28 @@ void MKLDNNBatchNormalizationNode::createPrimitive() {
if (prim)
return;
if (fusedWithScale()) {
auto prim_desc = createPrimitiveDescriptor<batch_normalization_forward::primitive_desc,
batch_normalization_forward::desc>();
prim.reset(new batch_normalization_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
(const primitive::at) internalBlobMemory[1]->GetPrimitive(),
(const primitive::at) internalBlobMemory[0]->GetPrimitive(),
(const primitive::at) internalBlobMemory[2]->GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
} else {
auto prim_desc = createPrimitiveDescriptor<batch_normalization_forward::primitive_desc,
batch_normalization_forward::desc>();
prim.reset(new batch_normalization_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
(const primitive::at) internalBlobMemory[1]->GetPrimitive(),
(const primitive::at) internalBlobMemory[0]->GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
auto prim_desc = createPrimitiveDescriptor<batch_normalization_forward::primitive_desc,
batch_normalization_forward::desc>();
prim.reset(new batch_normalization_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
const auto &mean = internalBlobMemory[1]->GetPrimitive();
const auto &var = internalBlobMemory[0]->GetPrimitive();
if (convert_to_c(flag) & dnnl_use_scaleshift) {
const auto &sclshft = internalBlobMemory[2]->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src},
{DNNL_ARG_MEAN, mean},
{DNNL_ARG_VARIANCE, var},
{DNNL_ARG_SCALE_SHIFT, sclshft},
{DNNL_ARG_DST, dst}};
} else {
primArgs = {{DNNL_ARG_SRC, src},
{DNNL_ARG_MEAN, mean},
{DNNL_ARG_VARIANCE, var},
{DNNL_ARG_DST, dst}};
}
}
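
createPrimitive now follows the oneDNN v1.x execution model: the primitive is constructed from the primitive_desc alone, and memory objects are bound per call through an argument map keyed by DNNL_ARG_* indices. A condensed sketch of that model, assuming src, mean, var and dst are existing mkldnn::memory objects and strm is an mkldnn::stream:

mkldnn::batch_normalization_forward bn(prim_desc);
std::unordered_map<int, mkldnn::memory> args = {
    {DNNL_ARG_SRC, src}, {DNNL_ARG_MEAN, mean},
    {DNNL_ARG_VARIANCE, var}, {DNNL_ARG_DST, dst}};
bn.execute(strm, args);  // memory is bound at execution time, not construction
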
@ -194,15 +171,16 @@ void MKLDNNBatchNormalizationNode::createDescriptor(const std::vector<InferenceE
MKLDNNDims dims = inDesc.getDims();
dims.push_back(1); // H
dims.push_back(1); // W
auto format = memory::nchw;
auto format = memory::format_tag::nchw;
inDesc = MKLDNNMemoryDesc(dims, inDesc.getDataType(), format);
}
unsigned flag = mkldnn_use_global_stats;
flag = normalization_flags::use_global_stats;
if (fusedWithScale())
flag |= mkldnn_use_scaleshift;
flag |= normalization_flags::use_scale_shift;
MKLDNNDescriptor desc(std::shared_ptr<batch_normalization_forward::desc>(
new batch_normalization_forward::desc(prop_kind::forward_scoring, inDesc, eps,
new mkldnn::batch_normalization_forward::desc(prop_kind::forward_scoring, inDesc, eps,
flag)));
descs.push_back(desc);
}
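
createDescriptor shows another v1.x change: normalization flags are now the typed enum class mkldnn::normalization_flags (the library defines the bitwise operators) instead of raw unsigned masks. A minimal sketch, with in_md and eps standing in for the values computed above:

auto flags = mkldnn::normalization_flags::use_global_stats
           | mkldnn::normalization_flags::use_scale_shift;
mkldnn::batch_normalization_forward::desc bn_desc(
        mkldnn::prop_kind::forward_scoring, in_md, eps, flags);
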
@ -237,7 +215,7 @@ void MKLDNNBatchNormalizationNode::initSupportedPrimitiveDescriptors() {
// BN primitive doesn't support strides
for (auto& desc : descs) {
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine());
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < desc.inputNumbers(); i++) {
@ -248,27 +226,25 @@ void MKLDNNBatchNormalizationNode::initSupportedPrimitiveDescriptors() {
config.inConfs.push_back(dataConfig);
}
std::vector<memory::format> outFormats;
for (size_t i = 0; i < desc.outputNumbers(); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(itpd, i);
config.outConfs.push_back(dataConfig);
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
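
The loop above adopts the v1.x iterator protocol: a primitive_desc_iterator converts to bool while an implementation is available and advances with next_impl(), replacing the old is_not_end() and operator++ pair. Sketched on its own, in the plugin's helper vocabulary:

auto itpd = desc.createPrimitiveDescriptorIterator(getEngine());
while (static_cast<bool>(itpd)) {
    impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
    // ... build and record the LayerConfig for this implementation ...
    if (!itpd.next_impl())
        break;  // no further implementations for this descriptor
}
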
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it,
size_t idx) {
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (getParentEdgeAt(0)->getDims().ndims() == 2 && desc.getLayout() == InferenceEngine::Layout::NCHW) {
desc.reshape(getParentEdgeAt(idx)->getDims().ToSizeVector(), InferenceEngine::Layout::NC);
@ -286,7 +262,7 @@ MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getSrcMemDesc(mkldnn::primitive_d
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it,
size_t idx) {
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_primitive_desc(idx).desc());
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_desc(idx));
if (getParentEdgeAt(0)->getDims().ndims() == 2 && desc.getLayout() == InferenceEngine::Layout::NCHW) {
desc.reshape(getParentEdgeAt(idx)->getDims().ToSizeVector(), InferenceEngine::Layout::NC);

View File

@ -25,16 +25,19 @@ public:
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void createPrimitive() override;
bool created() const override;
bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Eltwise
&& fusedWith[0]->getCnnLayer()->type == "ScaleShift";}
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
private:
mkldnn::normalization_flags flag = mkldnn::normalization_flags::none;
float eps = 0.0f;
MKLDNNMemoryDesc GetVarianceDesc(const mkldnn::memory::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetMeanDesc(const mkldnn::memory::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetScaleShiftWeightsDesc(const mkldnn::memory::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetVarianceDesc(const mkldnn::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetMeanDesc(const mkldnn::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetScaleShiftWeightsDesc(const mkldnn::primitive_desc& primitive_desc) const;
};
} // namespace MKLDNNPlugin

View File

@ -12,48 +12,106 @@
namespace MKLDNNPlugin {
struct jit_bin_conv_params {
int mb;
int ngroups;
int ic, oc, ic_padded;
int ih, iw, oh, ow;
int l_pad, t_pad, b_pad;
int kh, kw;
int stride_h, stride_w;
int dilate_h, dilate_w;
bool with_sum;
bool with_dw_conv;
bool with_binarization;
float pad_value;
bool exclude_pad;
int nb_ic, ic_block;
int nb_oc, oc_block;
int nb_oc_blocking;
int ur_w, ur_w_tail;
int typesize_in, typesize_out;
mkldnn::memory::data_type dst_dt;
};
struct jit_dw_conv_params {
int kh;
};
struct jit_bin_conv_call_args {
const void *src;
const void *dst;
const void *filt;
size_t kh_padding;
size_t kw_padding;
size_t oc_work;
size_t t_overflow;
size_t b_overflow;
size_t oc_off;
};
struct jit_uni_bin_conv_kernel {
void (*ker_)(const jit_bin_conv_call_args *);
void operator()(const jit_bin_conv_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_bin_conv_kernel(jit_bin_conv_params jcp, jit_dw_conv_params jcp_dw_conv, const mkldnn_primitive_attr &attr) :
ker_(nullptr), jcp_(jcp), jcp_dw_conv_(jcp_dw_conv), attr_(attr) {}
virtual ~jit_uni_bin_conv_kernel() {}
virtual void create_ker() = 0;
jit_bin_conv_params jcp_;
jit_dw_conv_params jcp_dw_conv_;
const mkldnn_primitive_attr &attr_;
};
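
The notable addition in this wrapper is the pure virtual create_ker(): mirroring oneDNN v1.6, code generation is a separate step, so a derived kernel is constructed first and only becomes callable once create_ker() has filled ker_. A hedged usage sketch (jit_avx2_bin_conv_kernel and the pointer values are hypothetical):

std::shared_ptr<jit_uni_bin_conv_kernel> k =
        std::make_shared<jit_avx2_bin_conv_kernel>(jcp, jcp_dw_conv, attr);
k->create_ker();                 // generate the JIT code before first use

jit_bin_conv_call_args args{};
args.src = src_ptr; args.filt = wei_ptr; args.dst = dst_ptr;
(*k)(&args);                     // dispatches through the generated ker_
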
class MKLDNNBinaryConvolutionNode : public MKLDNNNode {
public:
MKLDNNBinaryConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNBinaryConvolutionNode() override = default;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void initDescriptor(const InferenceEngine::LayerConfig& config) override;
void createPrimitive() override;
void initSupportedPrimitiveDescriptors() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;
}
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
void setPostOps(mkldnn::primitive_attr &attr);
bool canFuse(const MKLDNNNodePtr& node) const;
private:
bool withSum = false;
bool withBinarization = false;
bool withDWConv = false;
bool isDW = false;
bool isMerged = false;
bool isGrouped = false;
size_t group = 1;
float pad_value = 0.f;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
std::vector<ptrdiff_t> paddingL;
std::vector<ptrdiff_t> paddingR;
InferenceEngine::SizeVector weightDims;
InferenceEngine::SizeVector biasesDims;
ptrdiff_t dw_conv_oc = 0;
ptrdiff_t dw_conv_ih = 0;
ptrdiff_t dw_conv_iw = 0;
std::vector<ptrdiff_t> dw_conv_kernel;
std::vector<ptrdiff_t> dw_conv_strides;
mkldnn::memory::data_type dw_conv_in_dt = mkldnn::memory::data_type::data_undef;
std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;
jit_bin_conv_params jcp = {};
jit_dw_conv_params jcp_dw_conv = {};
std::shared_ptr<jit_uni_bin_conv_kernel> bin_conv_kernel = nullptr;
int baseInputsNumber;
mkldnn::primitive_attr attr;
float pad_value = 0.f;
impl_desc_type implType = impl_desc_type::ref;
void executeOptimized(const uint8_t* src, const uint8_t* weights, uint8_t* dst,
const std::vector<size_t>& s_str, const std::vector<size_t>& w_str, const std::vector<size_t>& d_str);
void executeReference(const uint8_t* src, const uint8_t* weights, uint8_t* dst,
const std::vector<size_t>& s_str, const std::vector<size_t>& w_str, const std::vector<size_t>& d_str);
};
} // namespace MKLDNNPlugin

View File

@ -101,10 +101,10 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format::nc :
parentEdge->getDims().ndims() == 4 ? memory::format::nhwc :
memory::format::ndhwc
: memory::format::any;
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format_tag::nc :
parentEdge->getDims().ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: memory::format_tag::any;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, fmt));
config.inConfs.push_back(dataConfig);
@ -116,9 +116,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format::nc :
dims.ndims() == 4 ? memory::format::nhwc :
memory::format::ndhwc
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format_tag::nc :
dims.ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: MKLDNNMemory::GetPlainFormat(dims);
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, fmt));
@ -128,25 +128,25 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
if (dims.ndims() == 4) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nChw8c);
MKLDNNMemoryDesc(dims, outputDataType, memory::format_tag::nChw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, memory::format_tag::nChw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nChw16c);
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nChw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nChw16c);
}
}
} else if (dims.ndims() == 5) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nCdhw8c);
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nCdhw16c);
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw16c);
}
}
}
@ -197,7 +197,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nhwc);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nhwc);
return;
} else if (numOfDim == 5) {
@ -231,7 +231,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::ndhwc);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::ndhwc);
return;
}
@ -303,8 +303,8 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
{blkDims, order, offset, offsets, strides});
}
if (canInplace) {
auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::nChw8c : mkldnn::memory::nChw16c
: sizeS == 8lu ? mkldnn::memory::nCdhw8c : mkldnn::memory::nCdhw16c;
auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::format_tag::nChw8c : mkldnn::memory::format_tag::nChw16c
: sizeS == 8lu ? mkldnn::memory::format_tag::nCdhw8c : mkldnn::memory::format_tag::nCdhw16c;
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, dstFormat);
}
}
@ -312,9 +312,6 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
}
void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision);
bool hasUnknown = false;
std::vector<size_t> canSelectPrimitive;
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
@ -382,48 +379,45 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
canOptimize = false;
}
std::map<mkldnn::memory::format, size_t> formatFrequency;
std::map<PartialBlkDesc, size_t> formatFrequency;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
auto parent = parentEdge->getParent();
if (parent->getSelectedPrimitiveDescriptor() == nullptr)
auto parent_pdesc = parent->getSelectedPrimitiveDescriptor();
if (parent_pdesc == nullptr)
continue;
int outputIndex = parentEdge->getOutputNum();
if (outputIndex < 0)
const auto &parent_config = parent_pdesc->getConfig();
int outputIndex = parentEdge->getInputNum();
if (outputIndex < 0 || outputIndex >= parent_config.outConfs.size())
THROW_IE_EXCEPTION << "Cannot find index of output node";
if (outputIndex >= parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size())
outputIndex = 0;
auto outDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[outputIndex].desc);
if (!outDesc)
const auto &port_desc = parent_config.outConfs[outputIndex].desc;
if (port_desc.getLayout() == Layout::ANY)
continue;
if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
formatFrequency[outDesc.getFormat()] += 1;
else
formatFrequency[outDesc.getFormat()] = 1;
auto partial_format_desc = PartialBlkDesc::extractFrom(port_desc);
formatFrequency[partial_format_desc] += 1;
}
for (size_t i = 0; i < getChildEdges().size(); i++) {
auto childEdge = getChildEdgeAt(i);
auto child = childEdge->getChild();
if (child->getSelectedPrimitiveDescriptor() == nullptr)
const auto *prim_desc = child->getSelectedPrimitiveDescriptor();
if (prim_desc == nullptr)
continue;
const auto &config = prim_desc->getConfig();
int inputIndex = childEdge->getOutputNum();
if (inputIndex < 0)
if (inputIndex < 0 || inputIndex >= config.inConfs.size())
THROW_IE_EXCEPTION << "Cannot find index of output node";
if (inputIndex >= child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size())
inputIndex = 0;
auto outDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[inputIndex].desc);
if (!outDesc)
const auto &port_desc = config.inConfs[inputIndex].desc;
if (port_desc.getLayout() == Layout::ANY)
continue;
if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
formatFrequency[outDesc.getFormat()] += 1;
else
formatFrequency[outDesc.getFormat()] = 1;
auto partial_format_desc = PartialBlkDesc::extractFrom(port_desc);
formatFrequency[partial_format_desc] += 1;
}
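
selectOptimalPrimitiveDescriptor now votes on PartialBlkDesc values instead of concrete mkldnn memory formats: every neighbour whose selected config exposes a real layout contributes one vote, and std::map's value-initialising operator[] removes the explicit find/insert dance of the old code. The counting idea in isolation (neighbour_descs is a hypothetical container of port descriptors):

std::map<PartialBlkDesc, size_t> freq;
for (const InferenceEngine::TensorDesc &d : neighbour_descs) {
    if (d.getLayout() == InferenceEngine::Layout::ANY)
        continue;                           // undefined layouts do not vote
    freq[PartialBlkDesc::extractFrom(d)] += 1;  // operator[] zero-initialises
}
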
size_t maxCount = 0;
mkldnn::memory::format convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
auto convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
for (auto &it : formatFrequency) {
if (it.second > maxCount) {
maxCount = it.second;
@ -431,15 +425,15 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
}
}
if (canOptimize && MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, convertTo).blocksExtended())
convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
if (canOptimize && convertTo.isAutoExtendedWith(getChildEdgeAt(0)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
if (MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, convertTo).blocksExtended())
convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
if (convertTo.isAutoExtendedWith(getParentEdgeAt(i)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
}
for (auto supportedPdIndex : canSelectPrimitive) {
if (MKLDNNMemoryDesc(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc).getFormat() == convertTo) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc) == convertTo) {
selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
return;
}
@ -449,10 +443,10 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
continue;
if (convertTo == MKLDNNMemoryDesc(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc).getFormat()) {
if (convertTo == PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc)) {
size_t num = 0;
for (num = 0; num < getParentEdges().size(); num++) {
if (MKLDNNMemoryDesc(getParentEdgeAt(num)->getDims(), inputDataType, convertTo).blocksExtended())
if (convertTo.isAutoExtendedWith(getParentEdgeAt(num)->getDims().ToSizeVector()))
break;
}
if (num == getParentEdges().size()) {
@ -482,8 +476,7 @@ void MKLDNNConcatNode::createPrimitive() {
if (getSelectedPrimitiveDescriptor() == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
std::vector<memory::primitive_desc> srcs_pd;
std::vector<primitive::at> srcs_p;
std::vector<memory::desc> srcs_d;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr();
@ -499,20 +492,18 @@ void MKLDNNConcatNode::createPrimitive() {
desc.data.dims[j] = dims[j];
}
srcs_pd.emplace_back(desc, srcMemPtr->GetPrimitiveDescriptor().get_engine());
srcs_p.emplace_back(srcMemPtr->GetPrimitive());
srcs_d.emplace_back(desc);
}
auto desc = getChildEdgeAt(0)->getMemory().GetDescriptor();
auto dims = getChildEdgeAt(0)->getDims();
for (size_t i = 0; i < dims.ndims(); i++) {
desc.data.dims[i] = dims[i];
desc.data.layout_desc.blocking.padding_dims[i] = dims[i];
desc.data.padded_dims[i] = dims[i];
}
auto primitive_desc = concat::primitive_desc(desc, static_cast<int>(axis), srcs_pd);
prim.reset(new concat(primitive_desc, srcs_p, getChildEdgeAt(0)->getMemory().GetPrimitive()));
auto primitive_desc = concat::primitive_desc(desc, static_cast<int>(axis), srcs_d, getEngine());
prim.reset(new concat(primitive_desc));
}
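
Concat construction follows the same v1.x pattern: the primitive_desc is built from plain memory::desc objects plus an engine (the old vector of memory::primitive_desc is gone), and the primitive itself carries no memory arguments. A minimal sketch with hypothetical descriptors:

std::vector<mkldnn::memory::desc> srcs_d = {src0_md, src1_md};
auto cpd = mkldnn::concat::primitive_desc(dst_md, /*concat axis*/ 1, srcs_d, eng);
mkldnn::concat cprim(cpd);  // sources are bound later, at execute() time
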
size_t MKLDNNConcatNode::inverseOrder(const SizeVector& order, size_t axis) {
@ -617,14 +608,13 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) {
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const mkldnn::memory::data_type data_type = dst_memory.GetDataType();
const size_t num_src = getParentEdges().size();
const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);
if (isInt8) {
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
const size_t num_src = getParentEdges().size();
std::vector<size_t> channels;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
@ -649,7 +639,11 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) {
}
});
} else {
MKLDNNNode::execute(strm);
std::unordered_map<int, memory> mem_args {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_args[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();
(*prim).execute(strm, mem_args);
}
}
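
The non-int8 path no longer delegates to MKLDNNNode::execute; it binds each source at DNNL_ARG_MULTIPLE_SRC + i and the output at DNNL_ARG_DST, then runs the stored primitive. The binding pattern on its own (dst_mem and src_mems are hypothetical):

std::unordered_map<int, mkldnn::memory> args{{DNNL_ARG_DST, dst_mem}};
for (int i = 0; i < num_src; ++i)
    args[DNNL_ARG_MULTIPLE_SRC + i] = src_mems[i];
concat_prim.execute(strm, args);
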

View File

@ -5,7 +5,6 @@
#include "mkldnn_conv_node.h"
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_pooling_node.h"
@ -16,6 +15,7 @@
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include <utils/general_utils.h>
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -23,15 +23,15 @@ using namespace InferenceEngine;
MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache), withBiases(false), withSum(false), withDWConv(false), isDW(false), isMerged(false),
isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::data_undef),
isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::undef),
groupNum(1lu), baseInputsNumber(1), eltwisePrecision(Precision::FP32) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(0));
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (!withBiases)
return MKLDNNMemoryDesc();
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(1).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(1));
});
auto ws = layer->blobs.find("w-scale");
@ -74,13 +74,13 @@ bool MKLDNNConvolutionNode::canBeExecutedInInt8() {
if (baseInputsNumber > 1) {
auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
if (!inputZeroPoints.empty())
inputDataType = memory::u8;
inputDataType = memory::data_type::u8;
auto weightsDataType = precisionToDataType(Precision::FP32);
if (baseInputsNumber > 1) {
weightsDataType = precisionToDataType(getCnnLayer()->insData[1].lock()->getPrecision());
if (!weightsZeroPoints.empty())
weightsDataType = memory::s8;
weightsDataType = memory::data_type::s8;
}
return (inputDataType == mkldnn_s8 || inputDataType == mkldnn_u8) && weightsDataType == mkldnn_s8;
@ -125,7 +125,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
if (!inputZeroPoints.empty())
inputDataType = memory::u8;
inputDataType = memory::data_type::u8;
auto outputDataType = precisionToDataType(getCnnLayer()->outData[0]->getPrecision());
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
@ -140,14 +140,14 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
// We need to make sure that convolution output and second input of fused Eltwise operation
// have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
if (outputDataType != memory::f32 && outputDataType != memory::bf16 && withSum) {
if (outputDataType != memory::data_type::f32 && outputDataType != memory::data_type::bf16 && withSum) {
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSum()) {
eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
eltwisePrecision = Precision::FP32;
outputDataType = memory::f32;
outputDataType = memory::data_type::f32;
}
break;
}
@ -260,7 +260,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
dw_conv_in_dt = precisionToDataType(fusedWith[i - 1].get()->getCnnLayer()->outData[0]->getPrecision());
}
} else {
dw_conv_in_dt = memory::f32;
dw_conv_in_dt = memory::data_type::f32;
}
for (int j = 0; j < paddingR.size(); j++) {
@ -279,15 +279,15 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
MKLDNNMemoryDesc in_candidate, out_candidate;
if (canBeExecutedInInt8()) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::ndhwc : memory::nhwc);
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::ndhwc : memory::nhwc);
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
createDescriptor({in_candidate}, {out_candidate});
} else {
inputDataType = (convLayer->input()->getPrecision() == Precision::BF16
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::bf16 : memory::f32;
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
outputDataType = (convLayer->outData[0]->getPrecision() == Precision::BF16
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::bf16 : memory::f32;
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
eltwisePrecision = Precision::FP32;
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
@ -300,61 +300,69 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
// for input of inplace tensor precision) to FP32. This will add reorder for that in-place tensor
// before the fused convolution. This behaviour might be more correct regarding the expected markup
// of the graph but performance of first and second approaches might be different. Need to verify
outputDataType = eltwisePrecision == Precision::BF16 ? memory::bf16 : memory::f32;
outputDataType = eltwisePrecision == Precision::BF16 ? memory::data_type::bf16 : memory::data_type::f32;
}
}
// correction for cases of FP32 input - we do not have FP32 convolution supported BF16 output
if (inputDataType == memory::f32
&& (outputDataType == memory::bf16 || eltwisePrecision == Precision::BF16)) {
outputDataType = memory::f32;
if (inputDataType == memory::data_type::f32
&& (outputDataType == memory::data_type::bf16 || eltwisePrecision == Precision::BF16)) {
outputDataType = memory::data_type::f32;
eltwisePrecision = Precision::FP32;
}
Layout layout = convLayer->input()->getLayout();
if (layout == NCHW || layout == NHWC) {
if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::nchw : memory::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw8c);
createDescriptor({in_candidate}, {out_candidate});
} else {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw16c);
createDescriptor({in_candidate}, {out_candidate});
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw8c);
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::nchw : memory::nhwc);
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
layout == NCHW ? memory::nchw : memory::nhwc);
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
createDescriptor({in_candidate}, {out_candidate});
} else if (layout == NCDHW || layout == NDHWC) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::ncdhw : memory::ndhwc);
if (IC == 3 || IC == 1) {
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw8c);
createDescriptor({in_candidate}, {out_candidate});
} else {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw16c);
createDescriptor({in_candidate}, {out_candidate});
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw8c);
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::ncdhw : memory::ndhwc);
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
layout == NCDHW ? memory::ncdhw : memory::ndhwc);
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
createDescriptor({in_candidate}, {out_candidate});
}
}
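
Throughout getSupportedDescriptors the bare memory::format enum becomes memory::format_tag, and a dedicated plain-layout branch is added for the IC == 1 && groupOC == 1 case. The tag choice itself reduces to a small decision; a sketch with ndims standing in for the input rank:

using tag = mkldnn::memory::format_tag;
const bool is5D = (ndims == 5);
tag plain     = is5D ? tag::ncdhw    : tag::nchw;
tag blocked8  = is5D ? tag::nCdhw8c  : tag::nChw8c;
tag blocked16 = is5D ? tag::nCdhw16c : tag::nChw16c;
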
@ -370,7 +378,7 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode && eltwiseNode->isSum()) {
ops.append_sum(1.0, mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
ops.append_sum(1.0, precisionToDataType(eltwisePrecision));
continue;
}
@ -396,43 +404,46 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, weightsPrc, memory::format::Goihw8g);
PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, weightsPrc, memory::format_tag::Goihw8g);
PostOpsIntBlobMemory[blob_idx]->FillZero();
Blob::Ptr weights = convLayer->blobs.find("weights")->second;
Blob::Ptr biases = convLayer->blobs.find("biases")->second;
PostOpsIntBlobMemory[blob_idx]->SetData(weightsPrc, memory::goihw, weights->buffer(),
PostOpsIntBlobMemory[blob_idx]->SetData(weightsPrc, memory::format_tag::goihw, weights->buffer(),
dwWeightsDims.size() * MKLDNNExtensionUtils::sizeOfDataType(weightsPrc));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
MKLDNNDims dwBiasesDims({dw_conv_oc});
PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, biasPrc, memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, biasPrc, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
PostOpsIntBlobMemory[blob_idx + 1]->SetData(biasPrc, memory::x, biases->buffer(),
PostOpsIntBlobMemory[blob_idx + 1]->SetData(biasPrc, memory::format_tag::x, biases->buffer(),
dwBiasesDims.size() * MKLDNNExtensionUtils::sizeOfDataType(biasPrc));
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
(const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
(const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
// TODO: rewrite to use append_dw_k3s2p1
// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
// mkldnn::memory::convert_to_c(dw_conv_in_dt),
// (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
// (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
blob_idx += 2;
} else {
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
static_cast<float *>(getParentEdgeAt(
baseInputsNumber + 0)->getMemory().GetData()),
static_cast<float *>(getParentEdgeAt(
baseInputsNumber + 1)->getMemory().GetData()));
// TODO: rewrite to use append_dw_k3s2p1
// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
// mkldnn::memory::convert_to_c(dw_conv_in_dt),
// static_cast<float *>(getParentEdgeAt(
// baseInputsNumber + 0)->getMemory().GetData()),
// static_cast<float *>(getParentEdgeAt(
// baseInputsNumber + 1)->getMemory().GetData()));
}
} else {
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
nullptr,
nullptr);
// TODO: rewrite to use append_dw_k3s2p1
// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
// mkldnn::memory::convert_to_c(dw_conv_in_dt),
// nullptr,
// nullptr);
}
if (convolutionNode->wScale != nullptr) {
@ -458,24 +469,26 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
MKLDNNDims oScaleDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx]->Create(oScaleDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx]->Create(oScaleDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx]->FillZero();
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, &oScaleDataVector[0],
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::format_tag::x, &oScaleDataVector[0],
oScaleDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx + 1]->Create(oScaleDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->Create(oScaleDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, &oShiftDataVector[0],
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::format_tag::x, &oShiftDataVector[0],
oShiftDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
ops.append_depthwise(depthwise_scale_shift,
ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift,
(const float *)PostOpsIntBlobMemory[blob_idx]->GetData(),
(const float *)PostOpsIntBlobMemory[blob_idx + 1]->GetData());
blob_idx += 2;
}
THROW_IE_EXCEPTION << "append_dw_conv is not ported";
continue;
}
@ -499,7 +512,7 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
if (containJitImpl && isPossibleToSkipInitConfig(desc))
continue;
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
@ -522,14 +535,13 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format::Goihw8g);
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format_tag::Goihw8g);
config.inConfs.push_back(dataConfig);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format::x);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format_tag::x);
config.inConfs.push_back(dataConfig);
}
std::vector<memory::format> outFormats;
for (size_t i = 0; i < descOutputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
if (withSum) {
@ -547,15 +559,14 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
dataConfig.desc.setPrecision(eltwisePrecision);
config.inConfs.push_back(dataConfig);
}
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type & jit)
containJitImpl = true;
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
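The two loops above capture the v1.6 iteration idiom: the primitive descriptor iterator is now tested by boolean conversion instead of is_not_end(), advanced with next_impl() instead of operator++, and queried via impl_info_str(). A minimal sketch of the loop shape, assuming the plugin's MKLDNNDescriptor wrapper and parse_impl_name() shown above:

// Sketch only: v1.6 implementation-enumeration loop (Desc stands in for MKLDNNDescriptor).
template <typename Desc, typename Engine, typename Attr>
void forEachImplementation(Desc& desc, const Engine& engine, const Attr& attr) {
    auto itpd = desc.createPrimitiveDescriptorIterator(engine, attr);
    while (static_cast<bool>(itpd)) {                                 // v0.x: itpd.is_not_end()
        impl_desc_type impl = parse_impl_name(itpd.impl_info_str());  // v0.x: get_impl_info_str()
        // ... assemble a LayerConfig for this implementation ...
        if (!itpd.next_impl())                                        // v0.x: itpd++
            break;
    }
}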
@ -573,18 +584,14 @@ void MKLDNNConvolutionNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<convolution_forward::primitive_desc,
convolution_forward::desc>(attr);
if (withBiases) {
prim.reset(new convolution_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getBias(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
} else {
prim.reset(new convolution_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
}
prim.reset(new convolution_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
if (withBiases)
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_BIAS, getBias()}, {DNNL_ARG_DST, dst}};
else
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DST, dst}};
}
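The createPrimitive rework reflects the core v1.x execution model: a primitive is built from its primitive_desc alone, and memory objects are supplied at execute() time through a map keyed by DNNL_ARG_* tags. A minimal sketch of the execute side that consumes a primArgs map like the one built above:

// Sketch: executing a v1.x primitive with an argument map.
#include <mkldnn.hpp>
#include <unordered_map>

void run(mkldnn::primitive& prim, mkldnn::stream& strm,
         const std::unordered_map<int, mkldnn::memory>& primArgs) {
    prim.execute(strm, primArgs);  // v0.x bound memory at construction and used strm.submit()
    strm.wait();
}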
bool MKLDNNConvolutionNode::created() const {
@ -602,16 +609,16 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
}
if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = baseInputsNumber == 3 ? precisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::s32;
wdt = memory::data_type::s8;
bdt = baseInputsNumber == 3 ? precisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::data_type::s32;
}
if (baseInputsNumber == 1) {
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = memory::s32;
wdt = memory::data_type::s8;
bdt = memory::data_type::s32;
Precision outPrec;
if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) {
@ -636,7 +643,7 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
MKLDNNDims blocked_weightDims(weightDims);
MKLDNNDims blocked_biasesDims(biasesDims);
MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::any};
MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::format_tag::any};
std::vector<algorithm> algorithms;
// We cannot map wino_format on tensor descriptor for now
@ -649,15 +656,21 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
try {
std::shared_ptr<mkldnn::convolution_forward::desc> conv_desc;
if (withBiases) {
MKLDNNMemoryDesc bias_candidate{blocked_biasesDims, bdt, memory::any};
MKLDNNMemoryDesc bias_candidate{blocked_biasesDims, bdt, memory::format_tag::any};
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_scoring, alg,
in_candidate, wgh_candidate, bias_candidate, out_candidate,
stride, dilation, paddingL, paddingR, padding_kind::zero));
mkldnn::memory::dims(stride.begin(), stride.end()),
mkldnn::memory::dims(dilation.begin(), dilation.end()),
mkldnn::memory::dims(paddingL.begin(), paddingL.end()),
mkldnn::memory::dims(paddingR.begin(), paddingR.end())));
} else {
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_scoring, alg,
in_candidate, wgh_candidate, out_candidate, stride, dilation,
paddingL, paddingR, padding_kind::zero));
in_candidate, wgh_candidate, out_candidate,
mkldnn::memory::dims(stride.begin(), stride.end()),
mkldnn::memory::dims(dilation.begin(), dilation.end()),
mkldnn::memory::dims(paddingL.begin(), paddingL.end()),
mkldnn::memory::dims(paddingR.begin(), paddingR.end())));
}
descs.emplace_back(conv_desc);
@ -674,8 +687,9 @@ void MKLDNNConvolutionNode::addZeroPoints(mkldnn::primitive_attr& attr) const {
if (!weightsZeroPoints.empty())
attr.set_weights_zero_points(1 << 1 /*through C dim*/, weightsZeroPoints);
if (!outputCompensation.empty())
if (!outputCompensation.empty()) {
attr.set_output_compensations(1 << 1 /*through C dim*/, outputCompensation);
}
}
void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) const {
@ -695,7 +709,6 @@ void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr)
}
}
attr.set_int_output_round_mode(mkldnn::round_nearest);
attr.set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector);
}
}
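The deleted set_int_output_round_mode() call has no v1.x counterpart: oneDNN now always rounds to nearest-even, so only the output-scales attribute survives. A minimal sketch of the remaining attribute setup, assuming a per-channel scale vector as above:

// Sketch: per-channel output scales in v1.x (mask 1 << 1 selects the C dimension).
#include <mkldnn.hpp>
#include <vector>

mkldnn::primitive_attr makeOutputScaleAttr(const std::vector<float>& oScaleDataVector) {
    mkldnn::primitive_attr attr;
    attr.set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector);
    return attr;
}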
@ -741,7 +754,7 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
if (containJitImpl && isPossibleToSkipInitConfig(desc))
continue;
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig cfg;
cfg.dynBatchSupport = true;
for (size_t j = 0; j < descInputNumbers(desc); j++) {
@ -762,10 +775,10 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format::Goihw8g);
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format_tag::Goihw8g);
cfg.inConfs.push_back(dataConfig);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format::x);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format_tag::x);
cfg.inConfs.push_back(dataConfig);
}
@ -783,7 +796,7 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
cfg.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type & jit)
containJitImpl = true;
@ -799,7 +812,8 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
}
}
selected_count++;
itpd++;
if (!itpd.next_impl())
break;
}
}
selectedPD->getConfig() = rightConfig;
@ -820,14 +834,12 @@ void MKLDNNConvolutionNode::filterSupportedDescriptors() {
while (itd != descs.end()) {
bool isSuitableDesc = true;
if (!inputMemoryFormatsFilter.empty()) {
auto src_fmt = std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.src_desc.format;
if (src_fmt != inputMemoryFormatsFilter[0])
isSuitableDesc = false;
MKLDNNMemoryDesc src_tdesc(std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.src_desc);
isSuitableDesc &= src_tdesc.isSame(inputMemoryFormatsFilter[0]);
}
if (!outputMemoryFormatsFilter.empty()) {
auto dst_fmt = std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.dst_desc.format;
if (dst_fmt != outputMemoryFormatsFilter[0])
isSuitableDesc = false;
MKLDNNMemoryDesc dst_tdesc(std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.dst_desc);
isSuitableDesc &= dst_tdesc.isSame(outputMemoryFormatsFilter[0]);
}
if (!isSuitableDesc) {
itd = descs.erase(itd);
@ -861,21 +873,21 @@ bool MKLDNNConvolutionNode::isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) {
isPossibleJitPlanar = false;
std::shared_ptr<mkldnn::convolution_forward::desc> convDesc(desc);
auto srcMemFmt = convDesc->data.src_desc.format;
auto dstMemFmt = convDesc->data.dst_desc.format;
auto srcMemDesc = MKLDNNMemoryDesc {convDesc->data.src_desc};
auto dstMemDesc = MKLDNNMemoryDesc {convDesc->data.dst_desc};
auto srcDataType = convDesc->data.src_desc.data_type;
auto dstDataType = convDesc->data.dst_desc.data_type;
bool isPlanarFloatConv = (srcMemFmt == memory::nchw || srcMemFmt == memory::ncdhw)
&& (dstMemFmt == memory::nchw || dstMemFmt == memory::ncdhw)
&& srcDataType == memory::f32
&& dstDataType == memory::f32;
bool isPlanarFloatConv = srcMemDesc.isPlainFormat()
&& dstMemDesc.isPlainFormat()
&& srcDataType == memory::data_type::f32
&& dstDataType == memory::data_type::f32;
return !isPossibleJitPlanar && isPlanarFloatConv;
}
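Both hunks above work around the removal of the flat format enum from v1.x memory descriptors: instead of comparing src_desc.format values, the code wraps the C descriptor in MKLDNNMemoryDesc and asks isSame()/isPlainFormat(). A sketch of what such a plain-format test reduces to at the C-API level (field names as used in the Crop hunk below; the helper itself is illustrative):

// Sketch (illustrative helper): "plain" means blocked format kind with no inner blocks.
#include <mkldnn_types.h>

bool isPlainLayout(const mkldnn_memory_desc_t& md) {
    return md.format_kind == dnnl_blocked &&
           md.format_desc.blocking.inner_nblks == 0;  // plain strides, no nChw8c-style blocking
}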
MKLDNNMemoryDesc MKLDNNConvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(idx - 1).desc())
: MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_desc(idx - 1))
: MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),

View File

@ -32,6 +32,7 @@ public:
bool canBeInPlace() const override {
return false;
}
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
size_t descInputNumbers(MKLDNNDescriptor desc) override {

View File

@ -10,6 +10,7 @@
#include <mkldnn_extension_utils.h>
#include "ie_parallel.hpp"
#include "common/cpu_memcpy.h"
#include "utils/general_utils.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -32,7 +33,7 @@ void MKLDNNCropNode::getSupportedDescriptors() {
MKLDNNDims childDims = getChildEdgeAt(0)->getDims();
offsets.resize(static_cast<size_t>(childDims.ndims())); // plus one dim for batch
dims.resize(static_cast<size_t>(childDims.ndims())); // plus one dim for batch
for (int i = 0; i < childDims.ndims(); i++)
dims[i] = childDims[i];
@ -70,11 +71,11 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() {
THROW_IE_EXCEPTION << "Crop supports only 2d, 4d and 5d blobs.";
}
memory::format fmt = memory::format::format_undef;
memory::format_tag fmt = memory::format_tag::undef;
switch (inDims.ndims()) {
case 2: fmt = memory::format::nc; break;
case 4: fmt = memory::format::nchw; break;
case 5: fmt = memory::format::ncdhw; break;
case 2: fmt = memory::format_tag::nc; break;
case 4: fmt = memory::format_tag::nchw; break;
case 5: fmt = memory::format_tag::ncdhw; break;
}
InferenceEngine::LayerConfig config;
@ -93,12 +94,12 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() {
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt);
if ((inDims.ndims() == 4 || inDims.ndims() == 5) && channelAxis >= 0 && dims[channelAxis] % 8 == 0) {
fmt = inDims.ndims() == 5 ? memory::format::nCdhw8c : memory::format::nChw8c;
fmt = inDims.ndims() == 5 ? memory::format_tag::nCdhw8c : memory::format_tag::nChw8c;
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmt);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt);
if (dims[channelAxis] % 16 == 0) {
fmt = inDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c;
fmt = inDims.ndims() == 5 ? memory::format_tag::nCdhw16c : memory::format_tag::nChw16c;
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmt);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt);
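The v0.x memory::format values used by Crop map one-to-one onto v1.x memory::format_tag, as the rank switch at the top of this hunk shows; the same mapping in compact helper form:

// Sketch: rank -> plain format_tag mapping (same table as the switch above).
#include <mkldnn.hpp>

mkldnn::memory::format_tag plainTagFor(int ndims) {
    using tag = mkldnn::memory::format_tag;
    switch (ndims) {
        case 2: return tag::nc;
        case 4: return tag::nchw;
        case 5: return tag::ncdhw;
        default: return tag::undef;  // v0.x: memory::format::format_undef
    }
}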
@ -121,14 +122,19 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
auto& parentMem = getParentEdgeAt(0)->getMemory();
int m_block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(parentMem.GetFormat())) {
m_block_size = parentMem.GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!parentMem.GetDesc().isPlainFormat()) {
const auto &desc = parentMem.GetDescriptor().data;
const auto &blk = desc.format_desc.blocking;
IE_ASSERT(desc.format_kind == dnnl_blocked &&
blk.inner_nblks == 1 &&
blk.inner_idxs[0] == 1);
m_block_size = blk.inner_blks[0];
}
const int m_inner_dim = dims[dims.size() - 1] * m_block_size;
const memory &dst_d = getChildEdgeAt(0)->getMemory().GetPrimitive();
const auto &dst_mem = getChildEdgeAt(0)->getMemory();
const int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;
const int dst_ndims = dst_mem.GetDesc().getDims().ndims();
// TODO: rewrite for the general case: any tensor rank,
// without relying on the N,C,D,H,W letters
@ -154,12 +160,10 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
const int IH = (src_ndims > 2) ? src_dims[src_dims.size() - 2] : 1;
const int IW = (src_ndims > 3) ? src_dims[src_dims.size() - 1] : 1;
const uint8_t itemSize = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(parentMem.GetDataType()));
const size_t itemSize = parentMem.GetDesc().GetElementSize();
const auto *src_data = reinterpret_cast<const uint8_t *>(parentMem.GetData()) +
itemSize * parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
auto *dst_data = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetData()) +
itemSize * getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
const auto *src_data = reinterpret_cast<const uint8_t*>(parentMem.GetPtr());
auto *dst_data = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetPtr());
if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
parallel_for(ON, [&](int n) {

View File

@ -3,7 +3,6 @@
//
#include "mkldnn_deconv_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
@ -12,6 +11,7 @@
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include "ie_parallel.hpp"
#include "utils/general_utils.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -20,7 +20,7 @@ using namespace InferenceEngine;
MKLDNNDeconvolutionNode::MKLDNNDeconvolutionNode(const InferenceEngine::CNNLayerPtr& layer,
const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(layer, eng, cache) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(0));
});
}
@ -127,25 +127,26 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
void MKLDNNDeconvolutionNode::setBiasAsPostOp(const InferenceEngine::Blob::Ptr& biases) {
mkldnn::post_ops ops;
MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biases->size(), 16))});
auto depthwiseSize = static_cast<ptrdiff_t>(rnd_up(biases->size(), 16));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[0]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[0]->Create({depthwiseSize}, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[0]->FillZero();
std::vector<float> weights(biases->size());
for (int i = 0; i < biases->size(); i++) {
weights[i] = 1;
}
PostOpsIntBlobMemory[0]->SetData(memory::data_type::f32, memory::x, &weights[0],
biases->size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
std::vector<float> weights(depthwiseSize, 1.0f);
std::fill(weights.begin() + biases->size(), weights.end(), 0.0f);
PostOpsIntBlobMemory[0]->SetData(memory::data_type::f32, memory::format_tag::x, weights.data(),
weights.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[1]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[1]->Create({depthwiseSize}, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[1]->FillZero();
PostOpsIntBlobMemory[1]->SetData(memory::data_type::f32, memory::x, biases->buffer(),
biases->size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
auto biases_ptr = biases->buffer().as<float*>();
std::vector<float> bias(depthwiseSize, 0.0f);
std::copy(biases_ptr, biases_ptr + biases->size(), bias.begin());
PostOpsIntBlobMemory[1]->SetData(memory::data_type::f32, memory::format_tag::x, bias.data(),
bias.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
ops.append_depthwise(depthwise_scale_shift,
ops.append_depthwise(algorithm::depthwise_scale_shift,
(const float *) PostOpsIntBlobMemory[0]->GetData(),
(const float *) PostOpsIntBlobMemory[1]->GetData());
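setBiasAsPostOp() emulates a bias through a depthwise scale-shift post-op: the scale buffer is all ones over the real channels, the shift buffer carries the bias values, and both are zero-padded to a multiple of 16 because the injector reads rounded-up channel blocks. The buffer preparation alone, as a self-contained sketch (rnd_up written out inline):

// Sketch: scale/shift buffers for the depthwise bias emulation above.
#include <algorithm>
#include <cstddef>
#include <vector>

void makeBiasScaleShift(const float* bias, size_t channels,
                        std::vector<float>& scales, std::vector<float>& shifts) {
    const size_t padded = (channels + 15) / 16 * 16;           // rnd_up(channels, 16)
    scales.assign(padded, 1.0f);                               // identity scale...
    std::fill(scales.begin() + channels, scales.end(), 0.0f);  // ...zeroed in the pad tail
    shifts.assign(padded, 0.0f);
    std::copy(bias, bias + channels, shifts.begin());          // shift == bias
}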
@ -166,14 +167,12 @@ void MKLDNNDeconvolutionNode::filterSupportedDescriptors() {
while (itd != descs.end()) {
bool isSuitableDesc = true;
if (!inputMemoryFormatsFilter.empty()) {
auto src_fmt = std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.src_desc.format;
if (src_fmt != inputMemoryFormatsFilter[0])
isSuitableDesc = false;
auto src_tdesc = MKLDNNMemoryDesc(std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.diff_src_desc);
isSuitableDesc &= src_tdesc.isSame(inputMemoryFormatsFilter[0]);
}
if (!outputMemoryFormatsFilter.empty()) {
auto dst_fmt = std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.dst_desc.format;
if (dst_fmt != outputMemoryFormatsFilter[0])
isSuitableDesc = false;
auto dst_tdesc = MKLDNNMemoryDesc(std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.diff_dst_desc);
isSuitableDesc &= dst_tdesc.isSame(outputMemoryFormatsFilter[0]);
}
if (!isSuitableDesc) {
itd = descs.erase(itd);
@ -184,12 +183,6 @@ void MKLDNNDeconvolutionNode::filterSupportedDescriptors() {
}
}
void MKLDNNDeconvolutionNode::execute(mkldnn::stream strm) {
if (prim) {
strm.submit({*prim});
}
}
bool MKLDNNDeconvolutionNode::created() const {
return getType() == Deconvolution;
}
@ -201,10 +194,11 @@ void MKLDNNDeconvolutionNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<convolution_backward_data::primitive_desc,
convolution_backward_data::desc, convolution_forward::primitive_desc>(attr);
prim.reset(new convolution_backward_data(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new convolution_backward_data(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_DIFF_DST, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DIFF_SRC, dst}};
}
void MKLDNNDeconvolutionNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
@ -216,31 +210,41 @@ void MKLDNNDeconvolutionNode::createDescriptor(const std::vector<InferenceEngine
if ((withGroups && !isDW) && (in_candidate.blocksExtended() || out_candidate.blocksExtended()))
return;
MKLDNNMemoryDesc wgh_candidate{weightsDims, in_candidate.getDataType(), memory::any};
MKLDNNMemoryDesc wgh_candidate{weightsDims, in_candidate.getDataType(), memory::format_tag::any};
for (auto alg : {algorithm::convolution_winograd, algorithm::convolution_direct}) {
try {
std::shared_ptr<mkldnn::convolution_forward::desc> conv_desc;
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_inference, alg,
out_candidate, wgh_candidate, in_candidate, stride, dilation,
paddingL, paddingR, padding_kind::zero));
auto convert = [] (const std::vector<ptrdiff_t>& orig_dims) {
return memory::dims(orig_dims.begin(), orig_dims.end());
};
std::shared_ptr<mkldnn::convolution_backward_data::desc> deconv_desc;
deconv_desc.reset(new convolution_backward_data::desc(alg, out_candidate, wgh_candidate,
in_candidate, stride, dilation, paddingL, paddingR,
padding_kind::zero));
descs_fwd.push_back(conv_desc);
descs_bwd.push_back(deconv_desc);
std::shared_ptr<mkldnn::convolution_forward::desc> conv_desc;
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_inference, alg,
out_candidate, wgh_candidate, in_candidate,
convert(stride),
convert(dilation),
convert(paddingL),
convert(paddingR)));
descs.emplace_back(deconv_desc,
std::shared_ptr<convolution_forward::primitive_desc>(
new convolution_forward::primitive_desc(*conv_desc, getEngine())));
} catch(...) {}
std::shared_ptr<mkldnn::convolution_backward_data::desc> deconv_desc;
deconv_desc.reset(new convolution_backward_data::desc(alg, out_candidate, wgh_candidate,
in_candidate,
convert(stride),
convert(dilation),
convert(paddingL),
convert(paddingR)));
descs_fwd.push_back(conv_desc);
descs_bwd.push_back(deconv_desc);
auto fwd_conv_pd = std::make_shared<convolution_forward::primitive_desc>(*conv_desc, getEngine(), true);
if (fwd_conv_pd->get(true) == nullptr)
continue;
descs.emplace_back(deconv_desc, fwd_conv_pd);
}
}
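Instead of the old catch-all try/catch probing, the rewritten loop relies on the v1.6 allow_empty constructor flag: an unimplementable descriptor yields a primitive_desc with a null C handle rather than an exception, which get(true) exposes. A minimal sketch of that probe, assuming a prepared forward descriptor:

// Sketch: probing for an implementation via allow_empty instead of catch(...).
#include <mkldnn.hpp>
#include <memory>

std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
tryMakePrimitiveDesc(const mkldnn::convolution_forward::desc& d, const mkldnn::engine& eng) {
    auto pd = std::make_shared<mkldnn::convolution_forward::primitive_desc>(
            d, eng, /*allow_empty=*/true);
    return pd->get(true) == nullptr ? nullptr : pd;  // null handle => no implementation found
}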
MKLDNNMemoryDesc MKLDNNDeconvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(idx - 1).desc())
: MKLDNNMemoryDesc(primitive_desc_it.diff_dst_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_desc(idx - 1))
: MKLDNNMemoryDesc(primitive_desc_it.diff_dst_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
@ -268,7 +272,7 @@ MKLDNNMemoryDesc MKLDNNDeconvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_i
}
MKLDNNMemoryDesc MKLDNNDeconvolutionNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.diff_src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.diff_src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),

View File

@ -23,7 +23,6 @@ public:
void createPrimitive() override;
void filterSupportedPrimitiveDescriptors() override;
void filterSupportedDescriptors();
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;

View File

@ -12,17 +12,66 @@
namespace MKLDNNPlugin {
struct jit_def_conv_params {
int ndims;
int mb;
int dg;
int ngroups, ic, oc, oc_padded;
int id, ih, iw, od, oh, ow;
int f_pad, l_pad, t_pad;
int back_pad, r_pad, b_pad;
int kd, kh, kw;
int stride_d, stride_h, stride_w;
int dilate_d, dilate_h, dilate_w;
bool with_bias;
bool with_sum;
int nthr;
int nb_ic, ic_block;
int nb_oc, oc_block;
int nb_ic_blocking, nb_oc_blocking;
int ur_w;
int ur_w_tail;
int typesize_in;
int typesize_off;
int typesize_bia;
int typesize_out;
};
struct jit_def_conv_call_args {
const void *src;
const void *off;
const void *filt;
const void *bias;
const void *dst;
const void *buf;
size_t oh_pos;
};
struct jit_uni_def_conv_kernel {
void (*ker_)(const jit_def_conv_call_args *);
void operator()(const jit_def_conv_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_def_conv_kernel(jit_def_conv_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_def_conv_kernel() {}
virtual void create_ker() = 0;
jit_def_conv_params jcp_;
};
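The struct above is a plain functor around generated code: a derived jit class implements create_ker() to compile and fill ker_, after which operator() forwards a jit_def_conv_call_args pointer to the entry point. A usage sketch (the concrete kernel type is a stand-in; pointers come from the caller):

// Sketch: driving the kernel functor above; create_ker() must already have run.
template <typename Kernel>
void runOutputRow(Kernel& kernel, const void* src, const void* off, const void* filt,
                  const void* bias, void* dst, size_t oh_pos) {
    jit_def_conv_call_args args{};
    args.src = src; args.off = off; args.filt = filt;
    args.bias = bias; args.dst = dst;
    args.oh_pos = oh_pos;
    kernel(&args);  // invokes ker_(args) inside operator()
}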
class MKLDNNDeformableConvolutionNode : public MKLDNNNode {
public:
MKLDNNDeformableConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNDeformableConvolutionNode() override = default;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void initDescriptor(const InferenceEngine::LayerConfig& config) override;
void createPrimitive() override;
void initSupportedPrimitiveDescriptors() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;
@ -31,18 +80,23 @@ public:
InferenceEngine::Precision getRuntimePrecision() const override;
private:
bool withBiases = false;
bool isDW = false;
bool isMerged = false;
bool isGrouped = false;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
std::vector<ptrdiff_t> paddingL;
std::vector<ptrdiff_t> paddingR;
InferenceEngine::SizeVector weightDims;
InferenceEngine::SizeVector biasesDims;
size_t group = 1;
std::vector<ptrdiff_t> stride = {};
std::vector<ptrdiff_t> dilation = {};
std::vector<ptrdiff_t> paddingL = {};
int deformable_group = 1;
jit_def_conv_params jcp = {};
std::shared_ptr<jit_uni_def_conv_kernel> def_conv_kernel = nullptr;
void executeReference(const float* src, const float* offsets, const float* weights, float* dst,
const std::vector<size_t>& src_strides, const std::vector<size_t>& off_strides,
const std::vector<size_t>& wei_strides, const std::vector<size_t>& dst_strides);
void executeOptimized(const float* src, const float* offsets, const float* weights, float* dst,
const std::vector<size_t>& src_strides, const std::vector<size_t>& off_strides,
const std::vector<size_t>& dst_strides);
};
} // namespace MKLDNNPlugin

View File

@ -3,32 +3,35 @@
//
#include "mkldnn_eltwise_node.h"
#include <legacy/ie_layers.h>
#include <ie_parallel.hpp>
#include <mkldnn_types.h>
#include "utils/bfloat16.hpp"
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/ref_eltwise.hpp>
#include "mkldnn_extension_utils.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "common/emitter.h"
#include "jit_eltwise_emitters.hpp"
#include "jit_mkldnn_emitters.hpp"
#include <mkldnn_selective_build.h>
#include <string>
#include <vector>
#include <memory>
#include <algorithm>
#include <cmath>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include "utils/bfloat16.hpp"
#include "ie_parallel.hpp"
#include "mkldnn_quantize_node.h"
#include <map>
#include "jit_uni_eltwise.hpp"
#include "jit_uni_quantization.hpp"
#include "common/emitter.h"
#include "jit_eltwise_emitters.hpp"
#include "jit_mkldnn_emitters.hpp"
#include "ref_eltwise.hpp"
#include "mkldnn_pooling_node.h"
#include <mkldnn_selective_build.h>
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl::utils;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace Xbyak;
#define GET_OFF(field) offsetof(jit_eltwise_call_args, field)
@ -44,9 +47,9 @@ struct SupportedPrecisions {
struct EltwiseEmitterContext {
std::shared_ptr<jit_emitter> emitter;
mkldnn::impl::cpu::jit_generator *host;
mkldnn::impl::cpu::cpu_isa_t host_isa;
const MKLDNNNode * node;
jit_generator *host;
cpu_isa_t host_isa;
const MKLDNNNode *node;
InferenceEngine::Precision exec_prc;
};
@ -60,10 +63,17 @@ struct EltwiseEmitter {
} // namespace
template <cpu_isa_t isa>
struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_generator {
struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic)
explicit jit_uni_eltwise_generic(jit_eltwise_params jep, MKLDNNEltwiseNode& eltwiseNode) : jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() {
explicit jit_uni_eltwise_generic(jit_eltwise_params jep, MKLDNNEltwiseNode& eltwiseNode) : jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
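This is the new two-phase kernel lifecycle in oneDNN 1.6's x64 jit_generator: generate() only describes the code, and a later create_ker() call runs create_kernel() to materialize the buffer, then caches the entry point from jit_ker(). The minimal shape, as a sketch:

// Sketch: two-phase kernel creation (v0.x generated code in the constructor
// and fetched it with getCode()).
struct my_jit_kernel : public mkldnn::impl::cpu::x64::jit_generator {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(my_jit_kernel)

    void (*ker_)(const void*) = nullptr;

    void create_ker() {                      // phase 2: compile, then cache entry point
        jit_generator::create_kernel();
        ker_ = (decltype(ker_))jit_ker();
    }

    void generate() override {               // phase 1: emit instructions only
        preamble();
        // ... kernel body ...
        postamble();
    }
};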
void generate() override {
Precision exec_prc = Precision::UNSPECIFIED;
std::set<Precision> supported_precision_intersection = get_supported_precisions(eltwiseNode);
@ -108,13 +118,15 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
quantizeNode->appendPostOps(post_ops);
quantization_injectors.push_back(std::make_shared<jit_uni_quantization_injector_f32<isa>>(
this, post_ops.get()->entry_[post_ops.get()->len_ - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
this, post_ops.get()->entry_[post_ops.len() - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
}
}
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
const auto &jep = jep_;
this->preamble();
for (int i = 0; i < jep.inputs_number; i++)
@ -130,7 +142,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
Xbyak::Label tail_loop_label;
Xbyak::Label tail_loop_end_label;
if (isa == avx512_common)
if (isa == x64::avx512_common)
vpxord(vmm_zero, vmm_zero, vmm_zero);
for (int i = 0; i < jep.inputs_number; i++) {
@ -287,12 +299,10 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
for (int i = 0; i < post_op_emitters.size(); i++) {
post_op_emitters[i]->emit_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == x64::sse41, Xmm, isa == x64::avx2, Ymm, Zmm>::type;
Reg64 get_src_reg(int idx) {
return Reg64(r8.getIdx() + idx);
@ -501,7 +511,7 @@ private:
} else {
auto quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(eltwiseNode.getFusedWith()[i].get());
bool do_dequantization = quantizeNode->getAlgorithm() == mkldnn::quantization_quantize_dequantize;
bool do_dequantization = quantizeNode->getOpType() == QuantizeOpType::FakeQuantization;
bool do_rounding = do_dequantization || jep_.dst_prc == Precision::FP32 || i != eltwiseNode.getFusedWith().size() - 1;
int s_idx = vmm_dst.getIdx();
@ -558,6 +568,8 @@ private:
uni_vcvtdq2ps(vmm_src, vmm_src);
break;
case Precision::I32:
if (src_prc == Precision::FP32 || src_prc == Precision::BF16)
uni_vcvtps2dq(vmm_src, vmm_src);
break;
default:
assert(!"unknown dst_prc");
@ -601,6 +613,8 @@ private:
uni_vcvtdq2ps(xmm_src, xmm_src);
break;
case Precision::I32:
if (src_prc == Precision::FP32 || src_prc == Precision::BF16)
uni_vcvtps2dq(xmm_src, xmm_src);
break;
default:
assert(!"unknown dst_prc");
@ -617,6 +631,8 @@ private:
uni_vcvtps2dq(vmm_dst, vmm_dst);
break;
case Precision::I32:
if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16)
uni_vcvtdq2ps(vmm_dst, vmm_dst);
break;
default:
assert(!"unknown src_prc");
@ -635,7 +651,7 @@ private:
vmovdqu16(op, ymm_dst);
break;
case Precision::I16:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovusdw(op, vmm_dst);
} else {
@ -643,36 +659,36 @@ private:
}
break;
case Precision::U16:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vpmovsdw(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
}
break;
case Precision::I8:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
break;
case Precision::U8:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -690,6 +706,8 @@ private:
uni_vcvtps2dq(xmm_dst, xmm_dst);
break;
case Precision::I32:
if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16)
uni_vcvtdq2ps(xmm_dst, xmm_dst);
break;
default:
assert(!"unknown src_prc");
@ -742,91 +760,91 @@ MKLDNNEltwiseNode::initializers = {
alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
beta = 0.0f;
opType = Relu;
algorithm = mkldnn::eltwise_relu;
algorithm = mkldnn::algorithm::eltwise_relu;
}},
{"gelu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Gelu;
algorithm = mkldnn::eltwise_gelu;
algorithm = mkldnn::algorithm::eltwise_gelu;
}},
{"elu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
beta = 0.0f;
opType = Elu;
algorithm = mkldnn::eltwise_elu;
algorithm = mkldnn::algorithm::eltwise_elu;
}},
{"tanh", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Tanh;
algorithm = mkldnn::eltwise_tanh;
algorithm = mkldnn::algorithm::eltwise_tanh;
}},
{"sigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Logistic;
algorithm = mkldnn::eltwise_logistic;
algorithm = mkldnn::algorithm::eltwise_logistic;
}},
{"logistic", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Logistic;
algorithm = mkldnn::eltwise_logistic;
algorithm = mkldnn::algorithm::eltwise_logistic;
}},
{"square", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Square;
algorithm = mkldnn::eltwise_square;
algorithm = mkldnn::algorithm::eltwise_square;
}},
{"abs", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Abs;
algorithm = mkldnn::eltwise_abs;
algorithm = mkldnn::algorithm::eltwise_abs;
}},
{"sqrt", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Sqrt;
algorithm = mkldnn::eltwise_sqrt;
algorithm = mkldnn::algorithm::eltwise_sqrt;
}},
{"linear", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
beta = activationLayer->GetParamAsFloat("beta", 0.0f);
opType = Linear;
algorithm = mkldnn::eltwise_linear;
algorithm = mkldnn::algorithm::eltwise_linear;
}},
{"bounded_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("alpha", 0.0f);
beta = 0.0f;
opType = BoundedRelu;
algorithm = mkldnn::eltwise_bounded_relu;
algorithm = mkldnn::algorithm::eltwise_bounded_relu;
}},
{"soft_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = SoftRelu;
algorithm = mkldnn::eltwise_soft_relu;
algorithm = mkldnn::algorithm::eltwise_soft_relu;
}},
{"relu6", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("n", 6.0f);
beta = 0.0f;
opType = Relu6;
algorithm = mkldnn::eltwise_bounded_relu;
algorithm = mkldnn::algorithm::eltwise_bounded_relu;
}},
{"clamp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("max", 1.0f);
beta = activationLayer->GetParamAsFloat("min", 0.0f);
alpha = activationLayer->GetParamAsFloat("min", 1.0f);
beta = activationLayer->GetParamAsFloat("max", 0.0f);
opType = Clamp;
algorithm = mkldnn::eltwise_clamp;
algorithm = mkldnn::algorithm::eltwise_clip;
}},
{"exp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Exp;
algorithm = mkldnn::eltwise_exp;
algorithm = mkldnn::algorithm::eltwise_exp;
}},
{"not", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
@ -837,25 +855,25 @@ MKLDNNEltwiseNode::initializers = {
alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
beta = 0.0f;
opType = Swish;
algorithm = mkldnn::eltwise_swish;
algorithm = mkldnn::algorithm::eltwise_swish;
}},
{"hswish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Hswish;
algorithm = mkldnn::eltwise_hswish;
algorithm = mkldnn::algorithm::eltwise_hswish;
}},
{"mish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Mish;
algorithm = mkldnn::eltwise_mish;
algorithm = mkldnn::algorithm::eltwise_mish;
}},
{"hsigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Hsigmoid;
algorithm = mkldnn::eltwise_hsigmoid;
algorithm = mkldnn::algorithm::eltwise_hsigmoid;
}},
{"round", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
@ -863,9 +881,9 @@ MKLDNNEltwiseNode::initializers = {
opType = Round;
std::string mode = activationLayer->GetParamAsString("mode", "half_to_even");
if (mode == "half_to_even")
algorithm = mkldnn::eltwise_round_half_to_even;
algorithm = mkldnn::algorithm::eltwise_round_half_to_even;
else if (mode == "half_away_from_zero")
algorithm = mkldnn::eltwise_round_half_away_from_zero;
algorithm = mkldnn::algorithm::eltwise_round_half_away_from_zero;
else
THROW_IE_EXCEPTION << "Round layer with name " << activationLayer->name << " doesn't support mode " << mode;
}},
@ -916,13 +934,13 @@ void MKLDNNEltwiseNode::init() {
} else if (comparator(layerType, "scaleshift")) {
if (getCnnLayer().get()->blobs.size() == 2) {
eltwiseOp = MulAdd;
eltwiseAlgorithm = mkldnn::depthwise_scale_shift;
eltwiseAlgorithm = mkldnn::algorithm::depthwise_scale_shift;
} else {
eltwiseOp = Multiply;
}
} else if (comparator(layerType, "prelu")) {
eltwiseOp = Prelu;
eltwiseAlgorithm = mkldnn::depthwise_prelu;
eltwiseAlgorithm = mkldnn::algorithm::depthwise_prelu;
} else if (comparator(layerType, "activation") && initializers.find(getCnnLayer().get()->GetParamAsString("type")) != initializers.end()) {
initializers[getCnnLayer().get()->GetParamAsString("type")](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta);
} else if (comparator(layerType, "relu") ||
@ -999,7 +1017,7 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
canUseOptimizedImpl = mayiuse(cpu::sse42);
canUseOptimizedImpl = mayiuse(x64::sse41);
size_t expectedInputsNum = getOpInputsNum();
for (auto& postOp : fusedWith) {
@ -1087,35 +1105,39 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
auto initDesc = [&] (LayoutType lt) -> PrimitiveDescInfo {
auto createMemoryDesc = [lt](MKLDNNEdgePtr edge, Precision prc, size_t offset) -> TensorDesc {
if (lt == ChannelsFirst) {
std::vector<size_t> blocks = edge->getDims().ToSizeVector();
std::vector<size_t> order;
order.push_back(0);
for (size_t j = 2; j < blocks.size(); j++)
order.push_back(j);
if (blocks.size() > 1)
auto dims = edge->getDims().ToSizeVector();
auto ndims = dims.size();
std::vector<size_t> order(ndims);
std::iota(order.begin(), order.end(), 0);
if (ndims > 1) {
order.erase(order.begin() + 1);
order.push_back(1);
}
return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
std::vector<size_t> blocks(ndims);
for (size_t i = 0; i < order.size(); i++) {
blocks[i] = dims[order[i]];
}
return TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset});
} else if (lt == Blocked && edge->getDims()[1] != 1) {
size_t blockSize = mayiuse(cpu::avx512_common) ? 16 : 8;
size_t blockSize = mayiuse(x64::avx512_common) ? 16 : 8;
std::vector<size_t> blocks = edge->getDims().ToSizeVector();
std::vector<size_t> order(blocks.size());
for (size_t j = 0; j < order.size(); j++)
order[j] = j;
std::iota(order.begin(), order.end(), 0);
blocks[1] = div_up(blocks[1], blockSize);
blocks.push_back(blockSize);
order.push_back(1);
return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
return TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset});
} else {
std::vector<size_t> blocks = edge->getDims().ToSizeVector();
std::vector<size_t> order(blocks.size());
for (size_t j = 0; j < order.size(); j++)
order[j] = j;
std::iota(order.begin(), order.end(), 0);
return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
return TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset});
}
};
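The lambda above hand-builds InferenceEngine BlockingDesc triples {blocks, order, offset}: ChannelsFirst rotates dim 1 to the back of the order (NCHW to NHWC-style traversal), while Blocked splits the channel dim into div_up(C, blockSize) outer blocks plus a trailing inner block of size blockSize. The blocked arithmetic in isolation, as a sketch:

// Sketch: blocks/order arithmetic behind the Blocked branch above.
#include <numeric>
#include <vector>

void makeBlockedDesc(const std::vector<size_t>& dims, size_t blockSize,
                     std::vector<size_t>& blocks, std::vector<size_t>& order) {
    order.resize(dims.size());
    std::iota(order.begin(), order.end(), 0);             // natural order 0,1,2,...
    blocks = dims;
    blocks[1] = (blocks[1] + blockSize - 1) / blockSize;  // div_up: outer channel blocks
    blocks.push_back(blockSize);                          // trailing inner channel block
    order.push_back(1);                                   // which again indexes dim 1 (C)
}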
@ -1143,17 +1165,17 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
config.outConfs.push_back(dataConfig);
impl_desc_type impl_type;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
} else {
impl_type = impl_desc_type::ref;
}
return {config, impl_type, MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat()};
return {config, impl_type};
};
bool isChannelsFirstApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 1, 2, 4, 5);
@ -1243,10 +1265,10 @@ void MKLDNNEltwiseNode::createPrimitive() {
start_offset_in.resize(inputNum);
for (size_t i = 0; i < inputNum; i++) {
start_offset_in[i] = getParentEdgeAt(i)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding *
start_offset_in[i] = getParentEdgeAt(i)->getMemory().GetDescriptor().data.offset0 *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getParentEdgeAt(i)->getMemory().GetDescriptor().data.data_type));
}
start_offset_out = getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding *
start_offset_out = getChildEdgeAt(0)->getMemory().GetDescriptor().data.offset0 *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getChildEdgeAt(0)->getMemory().GetDescriptor().data.data_type));
};
@ -1388,13 +1410,16 @@ void MKLDNNEltwiseNode::createPrimitive() {
jep.oc_size = oc_size;
if (mayiuse(cpu::avx512_common)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx512_common>(jep, *this));
} else if (mayiuse(cpu::avx2)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx2>(jep, *this));
} else if (mayiuse(cpu::sse42)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::sse42>(jep, *this));
if (mayiuse(x64::avx512_common)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<x64::avx512_common>(jep, *this));
} else if (mayiuse(x64::avx2)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<x64::avx2>(jep, *this));
} else if (mayiuse(x64::sse41)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<x64::sse41>(jep, *this));
}
if (eltwise_kernel)
eltwise_kernel->create_ker();
}
void MKLDNNEltwiseNode::selectOptimalPrimitiveDescriptor() {
@ -1448,6 +1473,26 @@ void MKLDNNEltwiseNode::selectOptimalPrimitiveDescriptor() {
selectPrimitiveDescriptorByIndex(0);
}
void MKLDNNEltwiseNode::initOptimalPrimitiveDescriptor() {
auto selected_pd = getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
auto config = selected_pd->getConfig();
if (!isInitConfig(config)) {
for (size_t i = 0; i < config.inConfs.size(); i++) {
config.inConfs[i].desc = getConfiguredInputDesc(config, i);
}
for (size_t i = 0; i < config.outConfs.size(); i++) {
config.outConfs[i].desc = getConfiguredOutputDesc(config, i);
}
initDescriptor(config);
} else {
initDescriptor(config);
}
}
void MKLDNNEltwiseNode::offset_out_calc(std::vector<size_t>& offset, std::vector<size_t>& dims) {
int k = 1;
for (int i = offset.size() - 1; i >= 0; i--) {
@ -1541,8 +1586,8 @@ void MKLDNNEltwiseNode::executeReference(const std::vector<const uint8_t *>& src
size_t inputNum = src_ptrs.size();
std::shared_ptr<ref_eltwise_scalar_fwd_t> ref_eltwise_injector = nullptr;
if (eltwiseAlgorithm != mkldnn::algorithm_undef) {
ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(static_cast<mkldnn_alg_kind_t>(eltwiseAlgorithm), alpha, beta);
if (eltwiseAlgorithm != mkldnn::algorithm::undef) {
ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(static_cast<mkldnn_alg_kind_t>(eltwiseAlgorithm), alpha, beta, 1.f);
}
parallel_nt(0, [&](const int ithr, const int nthr) {
@ -1664,29 +1709,29 @@ bool MKLDNNEltwiseNode::canBeInPlace() const {
void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) {
switch (getAlgorithm()) {
case mkldnn::eltwise_relu:
case mkldnn::eltwise_tanh:
case mkldnn::eltwise_elu:
case mkldnn::eltwise_square:
case mkldnn::eltwise_abs:
case mkldnn::eltwise_sqrt:
case mkldnn::eltwise_linear:
case mkldnn::eltwise_bounded_relu:
case mkldnn::eltwise_soft_relu:
case mkldnn::eltwise_logistic:
case mkldnn::eltwise_exp:
case mkldnn::eltwise_gelu:
case mkldnn::eltwise_clamp:
case mkldnn::eltwise_swish:
case mkldnn::eltwise_hswish:
case mkldnn::eltwise_mish:
case mkldnn::eltwise_hsigmoid:
case mkldnn::eltwise_round_half_to_even:
case mkldnn::eltwise_round_half_away_from_zero:
case mkldnn::algorithm::eltwise_relu:
case mkldnn::algorithm::eltwise_tanh:
case mkldnn::algorithm::eltwise_elu:
case mkldnn::algorithm::eltwise_square:
case mkldnn::algorithm::eltwise_abs:
case mkldnn::algorithm::eltwise_sqrt:
case mkldnn::algorithm::eltwise_linear:
case mkldnn::algorithm::eltwise_bounded_relu:
case mkldnn::algorithm::eltwise_soft_relu:
case mkldnn::algorithm::eltwise_logistic:
case mkldnn::algorithm::eltwise_exp:
case mkldnn::algorithm::eltwise_gelu:
case mkldnn::algorithm::eltwise_clip:
case mkldnn::algorithm::eltwise_swish:
case mkldnn::algorithm::eltwise_hswish:
case mkldnn::algorithm::eltwise_mish:
case mkldnn::algorithm::eltwise_hsigmoid:
case mkldnn::algorithm::eltwise_round_half_to_even:
case mkldnn::algorithm::eltwise_round_half_away_from_zero:
ops.append_eltwise(1.0, getAlgorithm(), getAlpha(), getBeta());
break;
case mkldnn::depthwise_scale_shift:
case mkldnn::depthwise_prelu:
case mkldnn::algorithm::depthwise_scale_shift:
case mkldnn::algorithm::depthwise_prelu:
if (scales.empty() && shifts.empty()) {
size_t bufferSize = static_cast<size_t>(outDims[0][outDims[0].size() > 1 ? 1 : 0]);
size_t bufferSizeAligned = rnd_up(bufferSize, 16);
@ -1742,7 +1787,7 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
return true;
};
if (!mayiuse(cpu::sse42))
if (!mayiuse(x64::sse41))
return false;
if (!isSuitableNode(this)) {

View File

@ -99,6 +99,8 @@ struct jit_uni_eltwise_kernel {
explicit jit_uni_eltwise_kernel(jit_eltwise_params jep, MKLDNNEltwiseNode& node) : ker_(nullptr), jep_(jep), eltwiseNode(node) {}
virtual ~jit_uni_eltwise_kernel() {}
virtual void create_ker() = 0;
jit_eltwise_params jep_;
MKLDNNEltwiseNode& eltwiseNode;
};
@ -111,6 +113,7 @@ public:
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void selectOptimalPrimitiveDescriptor() override;
void initOptimalPrimitiveDescriptor() override;
void createPrimitive() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
@ -136,7 +139,7 @@ private:
void init() override;
EltwiseOpType eltwiseOp = Add;
mkldnn::algorithm eltwiseAlgorithm = mkldnn::algorithm_undef;
mkldnn::algorithm eltwiseAlgorithm = mkldnn::algorithm::undef;
std::shared_ptr<jit_uni_eltwise_kernel> eltwise_kernel = nullptr;
jit_eltwise_params jep = {};

View File

@ -5,12 +5,13 @@
#include "mkldnn_fullyconnected_node.h"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_quantize_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <string>
#include <vector>
#include <mkldnn_extension_utils.h>
#include <mkldnn.hpp>
#include "utils/general_utils.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -19,40 +20,33 @@ using namespace InferenceEngine;
MKLDNNFullyConnectedNode::MKLDNNFullyConnectedNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache), withBiases(false), baseInputsNumber(0) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(0));
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (internalBlobs.size() <= 1)
return MKLDNNMemoryDesc();
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(1).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(1));
});
auto ws = layer->blobs.find("w-scale");
if (ws != layer->blobs.end()) {
wScale = ws->second;
}
if (getCnnLayer()->type == "FullyConnected" || getCnnLayer()->type == "InnerProduct") {
baseInputsNumber = getCnnLayer().get()->insData.size();
}
}
// Trying to find oi-scale
if (getCnnLayer()->type == "FullyConnected" && getCnnLayer()->precision == Precision::I8) {
if (baseInputsNumber != 1) {
THROW_IE_EXCEPTION << "Unsupported number of inputs for quantized FullyConnected " << getCnnLayer()->name;
}
auto ois = layer->blobs.find("oi-scale");
if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8)
&& ois == layer->blobs.end()) {
THROW_IE_EXCEPTION << "Internal error of graph quantization - mismatch of intermediate scales and next layer type for fully connected "
<< getCnnLayer()->name;
}
if (ois != layer->blobs.end()) {
// If we can find an oi-scale, then the next layer has to be an INT8.
oScale = ois->second;
}
}
std::vector<memory::format_tag> MKLDNNFullyConnectedNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
if (dims.ndims() == 0)
return {memory::format_tag::x};
else if (dims.ndims() == 1)
return {memory::format_tag::x};
else if (dims.ndims() == 2)
return {memory::format_tag::nc};
else if (dims.ndims() == 3)
return {memory::format_tag::tnc};
else if (dims.ndims() == 4)
return {memory::format_tag::nChw8c, memory::format_tag::nChw16c, memory::format_tag::nhwc, memory::format_tag::nchw};
else if (dims.ndims() == 5)
return {memory::format_tag::nCdhw8c, memory::format_tag::nCdhw16c, memory::format_tag::ndhwc, memory::format_tag::ncdhw};
return {memory::format_tag::any};
}
void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
@ -64,8 +58,8 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
precision = getCnnLayer()->outData[0]->getPrecision();
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
if (inputDataType == memory::f32) {
outputDataType = memory::f32;
if (inputDataType == memory::data_type::f32) {
outputDataType = memory::data_type::f32;
}
if (baseInputsNumber > 1) {
@ -77,10 +71,10 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
}
auto weightsDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[1].lock()->getPrecision());
// TODO(amalyse) in which cases do we have non-i8 weights and have to override the precisions?
if (((inputDataType != memory::u8 && inputDataType != memory::s8) || weightsDataType != memory::s8) && inputDataType != memory::bf16) {
inputDataType = memory::f32;
outputDataType = memory::f32;
if ((!one_of(inputDataType , memory::data_type::u8, memory::data_type::s8) || weightsDataType != memory::data_type::s8) &&
inputDataType != memory::data_type::bf16) {
inputDataType = memory::data_type::f32;
outputDataType = memory::data_type::f32;
}
}
@ -99,66 +93,36 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
if (getChildEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
MKLDNNDims inDims(fcLayer->input()->getDims());
MKLDNNDims outDims(fcLayer->outData[0]->getDims());
MKLDNNDims inDims = getParentEdgeAt(0)->getDims();
MKLDNNDims outDims = getChildEdgeAt(0)->getDims();
if (inDims.ndims() == 2) {
weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1])};
} else if (inDims.ndims() == 3) {
weightsDims = {static_cast<size_t>(outDims[2]), static_cast<size_t>(inDims[2])};
} else if (inDims.ndims() == 4) {
weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
static_cast<size_t>(inDims[3])};
} else if (inDims.ndims() == 5) {
weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
static_cast<size_t>(inDims[3]), static_cast<size_t>(inDims[4])};
} else {
THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4 or 2, got: "
if (!one_of(inDims.ndims(), 2, 3, 4, 5)) {
THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4, 3 or 2, got: "
<< inDims.ndims() << " dims.";
}
if (inDims.ndims() == 3) {
weightsDims = InferenceEngine::SizeVector({static_cast<size_t>(outDims[2]), static_cast<size_t>(inDims[2])});
} else {
weightsDims.push_back(outDims[1]);
for (int i = 1; i < inDims.ndims(); i++)
weightsDims.push_back(inDims[i]);
}
biasesDims.push_back(weightsDims[0]);
if (baseInputsNumber == 1) {
internalBlobs.push_back(createInternalBlob(weightsDims, true));
}
withBiases = (fcLayer->_biases != nullptr && fcLayer->_biases->size() != 0) || baseInputsNumber == 3;
if (inDims.ndims() == 3) {
biasesDims.push_back(static_cast<int>(outDims[2]));
} else {
biasesDims.push_back(static_cast<int>(fcLayer->_out_num));
}
if (withBiases && baseInputsNumber == 1) {
internalBlobs.push_back(createInternalBlob(biasesDims, false));
}
if (this->getCnnLayer()->blobs.find("weights") != this->getCnnLayer()->blobs.end()) {
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
// The weights blob has incorrect dims, so we have to fix it
TensorDesc wdesc = internalBlobs[0]->getTensorDesc();
wdesc.setPrecision(Precision::I8);
InferenceEngine::TBlob<int8_t>::Ptr reshapedInt8Weights =
InferenceEngine::TBlob<int8_t>::Ptr(
new InferenceEngine::TBlob<int8_t>(wdesc, static_cast<int8_t *>(weights->buffer()),
weights->byteSize()));
internalBlobs[0] = reshapedInt8Weights;
if (withBiases) {
Blob::Ptr biases = this->getCnnLayer()->blobs.find("biases")->second;
TensorDesc bdesc = internalBlobs[1]->getTensorDesc();
bdesc.setPrecision(Precision::I32);
InferenceEngine::TBlob<int32_t>::Ptr reshapedInt32Biases =
InferenceEngine::TBlob<int32_t>::Ptr(
new InferenceEngine::TBlob<int32_t>(bdesc, static_cast<int32_t *>(biases->buffer()),
biases->byteSize()));
internalBlobs[1] = reshapedInt32Biases;
}
}
}
for (auto format : getAvailableFormatsForDims(getParentEdgeAt(0)->getDims())) {
for (auto format : getAvailableFormatsForDims(inDims)) {
MKLDNNMemoryDesc in_candidate(inDims, inputDataType, format);
MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, memory::any);
MKLDNNMemoryDesc out_candidate(outDims, outputDataType, memory::format_tag::any);
createDescriptor({in_candidate}, {out_candidate});
}
@ -173,17 +137,36 @@ void MKLDNNFullyConnectedNode::createPrimitive() {
prim_desc = std::make_shared<inner_product_forward::primitive_desc>(
createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>(*attr));
if (withBiases) {
prim.reset(new inner_product_forward(*prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getBias(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
} else {
prim.reset(new inner_product_forward(*prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new inner_product_forward(*prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
if (withBiases)
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_BIAS, getBias()}, {DNNL_ARG_DST, dst}};
else
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DST, dst}};
}
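// Execution-model sketch under v1.6 (assuming memory objects src, weights,
// dst and a stream strm already exist): primitives are now constructed from
// the primitive_desc alone and receive their memories at execution time
// through a map keyed by DNNL_ARG_* tags.
//   mkldnn::inner_product_forward ip(*prim_desc);
//   std::unordered_map<int, mkldnn::memory> args = {
//       {DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, weights}, {DNNL_ARG_DST, dst}};
//   ip.execute(strm, args);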
void MKLDNNFullyConnectedNode::execute(mkldnn::stream strm) {
if (prim) {
auto reshapeMemory = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
auto oldMem = param->second;
auto dims = oldMem.get_desc().dims();
if (dims.size() == 3) {
MKLDNNDims normalizedDims({static_cast<ptrdiff_t>(dims[0] * dims[1]), static_cast<ptrdiff_t>(dims[2])});
mkldnn::memory::desc newMemDesc(oldMem.get_desc().reshape(normalizedDims));
mkldnn::memory newMem(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
primArgs.at(argType) = newMem;
}
}
};
reshapeMemory(DNNL_ARG_SRC);
reshapeMemory(DNNL_ARG_DST);
(*prim).execute(strm, primArgs);
}
}
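// Sketch of the zero-copy 3D -> 2D flattening used by reshapeMemory above,
// assuming a 3D mkldnn::memory `oldMem`: reshape() yields a new descriptor
// over the same underlying buffer, so no data moves.
//   auto d = oldMem.get_desc().dims();                         // {B, T, C}
//   auto flat = oldMem.get_desc().reshape({d[0] * d[1], d[2]}); // {B*T, C}
//   mkldnn::memory flatMem(flat, oldMem.get_engine(), oldMem.get_data_handle());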
@ -206,35 +189,35 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(ndims == 3 ? getChildEdgeAt(0)->getDims()[2] : getChildEdgeAt(0)->getDims()[1], 16))});
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx]->FillZero();
// In case ndims == 3, the graph optimizer allows fusing only if all weight values are the same
if (depthwiseLayer->blobs["weights"]->size() == 1 || ndims == 3) {
float broadcastValue = static_cast<float *>(depthwiseLayer->_weights->buffer())[0];
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx]->GetDesc().getDims()[0]; i++) {
static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
}
} else {
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::format_tag::x,
depthwiseLayer->_weights->buffer(),
depthwiseLayer->_weights->size() *
MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
}
if (eltwiseNode->getAlgorithm() == depthwise_scale_shift) {
if (eltwiseNode->getAlgorithm() == algorithm::depthwise_scale_shift) {
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
// In case ndims == 3, the graph optimizer allows fusing only if all bias values are the same
if (depthwiseLayer->blobs["biases"]->size() == 1 || ndims == 3) {
float broadcastValue = static_cast<float *>(depthwiseLayer->_biases->buffer())[0];
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx + 1]->GetDesc().getDims()[0]; i++) {
static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
}
} else {
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::format_tag::x,
depthwiseLayer->_biases->buffer(),
depthwiseLayer->_biases->size() *
MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
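// Summary of the mechanical API deltas this post-ops hunk relies on:
//   memory::format::x                                  -> memory::format_tag::x
//   attr.post_ops_.len_ (field)                        -> attr.post_ops_.len() (accessor)
//   mem->GetPrimitiveDescriptor().desc().data.dims[0]  -> mem->GetDesc().getDims()[0]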
@ -273,31 +256,6 @@ bool MKLDNNFullyConnectedNode::created() const {
return getType() == FullyConnected;
}
memory::format MKLDNNFullyConnectedNode::weightsFormatForSrcFormat(memory::format sourceFormat) {
switch (sourceFormat) {
case memory::format::x:
return memory::format::x;
case memory::format::nc:
case memory::format::tnc:
case memory::format::ntc:
return memory::format::oi;
case memory::format::nchw:
return memory::format::oihw;
case memory::format::ncdhw:
return memory::format::oidhw;
case memory::format::nChw8c:
return memory::format::oIhw8i;
case memory::format::nCdhw8c:
return memory::format::oIdhw8i;
case memory::format::nChw16c:
return memory::format::oIhw16i;
case memory::format::nCdhw16c:
return memory::format::oIdhw16i;
default:
THROW_IE_EXCEPTION << "Unsupported source format for node " << getName();
}
}
const std::vector<impl_desc_type>& MKLDNNFullyConnectedNode::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
@ -335,25 +293,6 @@ const std::vector<impl_desc_type>& MKLDNNFullyConnectedNode::getPrimitivesPriori
std::shared_ptr<mkldnn::primitive_attr> MKLDNNFullyConnectedNode::initPrimitiveAttr() {
auto attr = std::make_shared<mkldnn::primitive_attr>(mkldnn::primitive_attr());
if (wScale != nullptr) {
float* wScaleData = static_cast<float*>(wScale->buffer());
std::vector<float> oScaleDataVector;
if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) {
float *oScaleData = static_cast<float *>(oScale->buffer());
for (size_t c = 0; c < wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]);
}
} else {
for (size_t c = 0; c < wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c]);
}
}
attr->set_int_output_round_mode(mkldnn::round_nearest);
attr->set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector);
}
setPostOps(*attr, true);
@ -363,47 +302,31 @@ std::shared_ptr<mkldnn::primitive_attr> MKLDNNFullyConnectedNode::initPrimitiveA
void MKLDNNFullyConnectedNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
TensorDesc inDesc = inputDesc[0], outDesc = outputDesc[0];
mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
mkldnn::memory::data_type bdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
if (inDesc.getPrecision() == Precision::BF16) {
bdt = mkldnn::memory::data_type::f32;
} else if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::data_type::s8;
bdt = baseInputsNumber == 3 ? MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::data_type::f32;
}
if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = baseInputsNumber == 3 ? MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::f32;
}
if (this->getCnnLayer()->blobs.find("weights") != this->getCnnLayer()->blobs.end()) {
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = memory::s32;
Precision outPrec;
if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) {
outPrec = Precision::FP32;
} else {
// define the output precision according to the normalizer
// TODO(amalyshe) do we need a separate flow for the last node in an int8 chain or not?
outPrec = outDesc.getPrecision();
}
inDesc = TensorDesc(inDesc.getPrecision(), inputDesc[0].getDims(), inputDesc[0].getBlockingDesc());
outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), Layout::NC/*, outputDesc[0].getBlockingDesc()*/);
}
if (inDesc.getDims().size() == 3) {
auto inDims = inDesc.getDims();
auto outDims = outDesc.getDims();
InferenceEngine::SizeVector normalizedInDims = {inDims[0] * inDims[1], inDims[2]};
InferenceEngine::SizeVector normalizedOutDims = {outDims[0] * outDims[1], outDims[2]};
inDesc = InferenceEngine::TensorDesc(inDesc.getPrecision(), normalizedInDims, TensorDesc::getLayoutByDims(normalizedInDims));
outDesc = InferenceEngine::TensorDesc(outDesc.getPrecision(), normalizedOutDims, TensorDesc::getLayoutByDims(normalizedOutDims));
}
MKLDNNMemoryDesc in_candidate(inDesc);
MKLDNNMemoryDesc out_candidate(outDesc);
memory::format weights_fmt = weightsFormatForSrcFormat(in_candidate.getFormat());
MKLDNNMemoryDesc wgh_candidate(MKLDNNDims(weightsDims), wdt, weights_fmt);
MKLDNNMemoryDesc wgh_candidate(MKLDNNDims(weightsDims), wdt, mkldnn::memory::format_tag::any);
if (withBiases) {
MKLDNNMemoryDesc bias_candidate(MKLDNNDims(biasesDims), bdt, memory::any);
MKLDNNMemoryDesc bias_candidate(MKLDNNDims(biasesDims), bdt, memory::format_tag::any);
MKLDNNDescriptor desc(std::shared_ptr<inner_product_forward::desc>(
new inner_product_forward::desc(prop_kind::forward_scoring, in_candidate, wgh_candidate,
bias_candidate, out_candidate)));
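// Using memory::format_tag::any for the weights (and bias) descriptors
// lets the selected inner_product implementation choose its preferred
// weights layout; the graph inserts a reorder from the original layout if
// they differ. This replaces the removed weightsFormatForSrcFormat()
// lookup table.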
@ -417,17 +340,39 @@ void MKLDNNFullyConnectedNode::createDescriptor(const std::vector<InferenceEngin
}
MKLDNNMemoryDesc MKLDNNFullyConnectedNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(idx - 1).desc())
: MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_desc(idx - 1))
: MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
desc.getLayout()));
else
} else if (getParentEdgeAt(idx)->getDims().ndims() == 3) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
TensorDesc::getLayoutByDims(getParentEdgeAt(idx)->getDims().ToSizeVector())));
} else {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
desc.getBlockingDesc()));
}
}
MKLDNNMemoryDesc MKLDNNFullyConnectedNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
desc.getLayout()));
} else if (getChildEdgeAt(idx)->getDims().ndims() == 3) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
TensorDesc::getLayoutByDims(getChildEdgeAt(idx)->getDims().ToSizeVector())));
} else {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
desc.getBlockingDesc()));
}
}
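// For 3D edges the primitive was created on 2D-flattened shapes, so the
// descriptor queried from it cannot be used verbatim; both helpers above
// rebuild a plain layout for the original 3D dims via getLayoutByDims().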
const mkldnn::memory& MKLDNNFullyConnectedNode::getWeights() const {

View File

@ -17,9 +17,12 @@ public:
MKLDNNFullyConnectedNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNFullyConnectedNode() override = default;
std::vector<mkldnn::memory::format_tag> getAvailableFormatsForDims(const MKLDNNDims &dims) const override;
void getSupportedDescriptors() override;
void createPrimitive() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;
}
@ -33,6 +36,7 @@ public:
}
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
const mkldnn::memory& getWeights() const;
const mkldnn::memory& getBias() const;
@ -45,13 +49,10 @@ protected:
private:
InferenceEngine::SizeVector weightsDims;
InferenceEngine::SizeVector biasesDims;
mkldnn::memory::format weightsFormatForSrcFormat(mkldnn::memory::format sourceFormat);
std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
InferenceEngine::Blob::Ptr wScale, oScale;
bool withBiases;
int baseInputsNumber;
};

View File

@ -200,7 +200,7 @@ inline void process_gemm(char transa, char transb, int M, int N, int K, float al
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint16_t *A, int lda,
const uint16_t *B, int ldb, float beta, float *C, int ldc) {
mkldnn_gemm_bf16bf16f32(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
dnnl_gemm_bf16bf16f32(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint8_t *A, int lda,
@ -231,13 +231,11 @@ void MKLDNNGemmNode::process_data() {
auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
auto& dstMemory0 = getChildEdgeAt(0)->getMemory();
const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetData()) +
srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetData()) +
srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetPtr());
const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetPtr());
float *dst_ptr = reinterpret_cast<float*>(dstMemory0.GetPtr());
int MB1 = outDims.ndims() == 4 ? batchToProcess() : 1;
int MB2 = outDims.ndims() == 3 ? batchToProcess() : outDims.ndims() > 3 ? outDims[outDims.ndims() - 3] : 1;
@ -255,8 +253,7 @@ void MKLDNNGemmNode::process_data() {
const float *src2_ptr;
if (isThreeInputs) {
auto& srcMemory2 = getParentEdgeAt(2)->getMemory();
src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetData()) +
srcMemory2.GetDescriptor().data.layout_desc.blocking.offset_padding;
src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetPtr());
} else {
src2_ptr = dst_ptr;
}
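// Pointer-helper sketch (plugin-side), assuming an MKLDNNMemory `mem` and
// an element size `esz`: GetPtr() folds in the padding offset that v0.x
// code had to add by hand.
//   v0.x: auto p = static_cast<uint8_t*>(mem.GetData())
//             + mem.GetDescriptor().data.layout_desc.blocking.offset_padding * esz;
//   v1.6: auto p = static_cast<uint8_t*>(mem.GetPtr());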

View File

@ -55,12 +55,7 @@ void MKLDNNGenericNode::initSupportedPrimitiveDescriptors() {
}
for (auto& config : configs) {
std::vector<memory::format> outFormats;
for (auto& outConfig : config.outConfs) {
outFormats.push_back(MKLDNNMemory::Convert(outConfig.desc.getLayout()));
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFormats);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
}
}
if (impls.empty()) {

View File

@ -47,33 +47,30 @@ void MKLDNNInputNode::initSupportedPrimitiveDescriptors() {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
memory::format outFormat = mkldnn::memory::format_undef;
if (getType() == Input || getType() == MemoryInput) {
precision = getCnnLayer()->outData[0]->getPrecision();
if (precision == InferenceEngine::Precision::U16 || isMeanImage) {
precision = InferenceEngine::Precision::FP32;
}
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
outFormat = MKLDNNMemory::Convert(getCnnLayer()->outData[0]->getLayout());
dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
auto mem_tdesc = MKLDNNMemoryDesc(getCnnLayer()->outData[0]->getTensorDesc());
dataConfig.desc = mem_tdesc;
config.outConfs.push_back(dataConfig);
} else if (getType() == Output) {
precision = getCnnLayer()->insData[0].lock()->getPrecision();
if (precision == InferenceEngine::Precision::U16) precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
outFormat = MKLDNNMemory::Convert(getCnnLayer()->insData[0].lock()->getLayout());
dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, outFormat);
auto mem_tdesc = MKLDNNMemoryDesc(getCnnLayer()->insData[0].lock()->getTensorDesc());
dataConfig.desc = mem_tdesc;
config.inConfs.push_back(dataConfig);
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFormat);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
}
void MKLDNNInputNode::createPrimitive() {
@ -145,17 +142,28 @@ void MKLDNNInputNode::execute(mkldnn::stream strm) {
return;
auto dstBlob = getChildEdgeAt(0)->getBlob();
if (constBlob->size() != dstBlob->size()) {
THROW_IE_EXCEPTION << "Incorrect blob sizes for node " << getName();
}
if (constBlob->getTensorDesc() == dstBlob->getTensorDesc()
|| isCompatibleTensors(constBlob->getTensorDesc(), dstBlob->getTensorDesc())) {
const int8_t *srcData = constBlob->cbuffer().as<int8_t *>();
int8_t *dstData = dstBlob->buffer();
cpu_memcpy_s(dstData, dstBlob->byteSize(), srcData, constBlob->byteSize());
} else if (constBlob->getTensorDesc().getPrecision() == InferenceEngine::Precision::BIN ||
dstBlob->getTensorDesc().getPrecision() == InferenceEngine::Precision::BIN) {
size_t dstSize = dstBlob->size() / 8;
if (constBlob->size() != dstSize) {
THROW_IE_EXCEPTION << "Incorrect blob sizes for node " << getName();
}
const int8_t *srcData = constBlob->cbuffer().as<int8_t *>();
int8_t *dstData = dstBlob->buffer();
cpu_memcpy_s(dstData, dstSize, srcData, constBlob->byteSize());
} else {
if (constBlob->size() != dstBlob->size()) {
THROW_IE_EXCEPTION << "Incorrect blob sizes for node " << getName();
}
switch (precision.size()) {
case 1: {
const int8_t *srcData = constBlob->cbuffer().as<int8_t *>();

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_interpolate_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include "mkldnn_eltwise_node.h"
@ -16,10 +16,11 @@
#include "ie_parallel.hpp"
#include <algorithm>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
#include "common/cpu_memcpy.h"
#include "utils/bfloat16.hpp"
@ -28,6 +29,7 @@ using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
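// ISA naming after the migration: the CPU traits moved under
// mkldnn::impl::cpu::x64, and the sse42 enumerator became sse41 (the
// checks genuinely require SSE4.1), e.g.:
//   v0.x: mayiuse(cpu::sse42)   ->   v1.6: mayiuse(cpu::x64::sse41)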
@ -39,16 +41,24 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32)
explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const mkldnn_primitive_attr &attr)
: jit_uni_interpolate_kernel(jcp, attr), jit_generator() {
: jit_uni_interpolate_kernel(jcp, attr), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
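// In oneDNN v1.6 jit_generator splits kernel description from compilation:
// generate() only emits the instruction stream, create_kernel() assembles
// it, and jit_ker() exposes the entry point. This replaces the old
// `ker_ = (decltype(ker_)) this->getCode();` tail at the end of generate().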
void generate() override {
const auto &p = attr_.post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
this,
post_op.eltwise.alg,
post_op.eltwise.alpha,
post_op.eltwise.beta));
post_op.eltwise.beta,
1));
} else if (post_op.is_depthwise()) {
depthwise_injectors.push_back(std::make_shared<jit_uni_depthwise_injector_f32<isa>>(
this,
@ -64,9 +74,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
this->preamble();
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
if (isa == cpu::avx512_common)
if (isa == cpu::x64::avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
switch (jcp_.mode) {
@ -145,12 +155,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
if ((jcp_.mode == InterpolateMode::cubic) && (jcp_.layout == InterpolateLayoutType::planar)) {
prepare_cubic_planar_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
const int vlen = cpu_isa_traits<isa>::vlen;
@ -282,7 +290,7 @@ private:
uni_vmovdqu(vmm_index, ptr[reg_index]);
uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask);
vgatherdps(vmm_val, ptr[reg_src_h + vmm_index], vmm_mask);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 1);
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -305,7 +313,7 @@ private:
add(reg_src_aux, reg_index_offset);
load_scalar(xmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 1);
store_scalar(ptr[reg_dst], xmm_val, jcp_.dst_dt);
@ -328,7 +336,7 @@ private:
void nn_blk() {
int step = vlen / sizeof(float);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
step *= 2;
Xbyak::Label nn_loop_label;
@ -343,15 +351,15 @@ private:
add(reg_src_aux, reg_index_offset);
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 0);
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
int sse42_offset = 4;
add(reg_src_aux, sse42_offset * jcp_.src_data_size);
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
add(reg_oc_off, sse42_offset * sizeof(float));
apply_post_ops(jcp_.dst_dt, 0);
sub(reg_oc_off, sse42_offset * sizeof(float));
@ -398,7 +406,7 @@ private:
add(reg_src_aux, reg_index_offset);
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
L(nn_loop_label);
@ -407,7 +415,7 @@ private:
jl(nn_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 0);
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -427,7 +435,7 @@ private:
jl(nn_tail_loop_end_label, T_NEAR);
load_scalar(xmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 0);
store_scalar(ptr[reg_dst], xmm_val, jcp_.dst_dt);
@ -468,7 +476,7 @@ private:
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
int step = vlen / sizeof(float);
int blk = (isa == cpu::sse42) ? (2 * step) : step;
int blk = (isa == cpu::x64::sse41) ? (2 * step) : step;
Xbyak::Label main_loop_label;
Xbyak::Label main_loop_end_label;
@ -493,13 +501,13 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is vmm_valTR
add(reg_oc_off, step * sizeof(float));
}
store_vector(ptr[reg_dst], vmm_valTR, jcp_.dst_dt);
if ((isa == cpu::sse42) && (jcp_.layout == InterpolateLayoutType::block)) {
if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) {
int sse42_offset = 4; // vmm is xmm here
load_vector(vmm_valTL, ptr[reg_src + sse42_offset * jcp_.src_data_size], jcp_.src_dt);
load_vector(vmm_valTR, ptr[reg_src_aux + sse42_offset * jcp_.src_data_size], jcp_.src_dt);
@ -508,7 +516,7 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false);
add(reg_oc_off, step * sizeof(float));
}
@ -552,7 +560,7 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is vmm_valTR
add(reg_oc_off, step * sizeof(float));
}
@ -583,7 +591,7 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is vmm_valTR
add(reg_oc_off, step * sizeof(float));
}
@ -638,14 +646,14 @@ private:
vgatherdps(vmm_valBR, ptr[reg_src + vmm_index], vmm_mask);
// reg_src_aux points to the weights
load_vector(vmm_weightL, ptr[reg_src_aux], memory::f32);
load_vector(vmm_weightR, ptr[reg_src_aux + weight_stride], memory::f32);
load_vector(vmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::f32);
load_vector(vmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::f32);
load_vector(vmm_weightL, ptr[reg_src_aux], memory::data_type::f32);
load_vector(vmm_weightR, ptr[reg_src_aux + weight_stride], memory::data_type::f32);
load_vector(vmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::data_type::f32);
load_vector(vmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::data_type::f32);
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // vmm_val is vmm_valTR, broadcast is true
}
store_vector(ptr[reg_dst], vmm_valTR, jcp_.dst_dt);
@ -686,14 +694,14 @@ private:
add(reg_src_aux1, reg_index_offset);
load_scalar(xmm_valBR, ptr[reg_src_aux1], jcp_.src_dt);
load_scalar(xmm_weightL, ptr[reg_src_aux], memory::f32);
load_scalar(xmm_weightR, ptr[reg_src_aux + weight_stride], memory::f32);
load_scalar(xmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::f32);
load_scalar(xmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::f32);
load_scalar(xmm_weightL, ptr[reg_src_aux], memory::data_type::f32);
load_scalar(xmm_weightR, ptr[reg_src_aux + weight_stride], memory::data_type::f32);
load_scalar(xmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::data_type::f32);
load_scalar(xmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::data_type::f32);
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // process on vmm_val; vmm_val is vmm_valTR, and broadcast is true
}
store_scalar(ptr[reg_dst], xmm_valTR, jcp_.dst_dt);
@ -740,7 +748,7 @@ private:
uni_vbroadcastss(vmm_weightY3, ptr[reg_src_aux1 + 3 * sizeof(float)]);
int step = vlen / sizeof(float);
int blk = (isa == cpu::sse42) ? (2 * step) : step;
int blk = (isa == cpu::x64::sse41) ? (2 * step) : step;
Xbyak::Label main_loop_label;
Xbyak::Label main_loop_end_label;
@ -760,13 +768,13 @@ private:
cubic_c_gathered_matrix(false);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is the default dst value for post_ops and store
add(reg_oc_off, step * sizeof(float));
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
if ((isa == cpu::sse42) && (jcp_.layout == InterpolateLayoutType::block)) {
if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) {
int sse42_offset = 4; // vmm is xmm here
add(reg_src, sse42_offset * jcp_.src_data_size);
add(reg_dst, sse42_offset * jcp_.dst_data_size);
@ -775,7 +783,7 @@ private:
cubic_c_gathered_matrix(false);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false);
add(reg_oc_off, step * sizeof(float)); // second step for one blk
}
@ -814,7 +822,7 @@ private:
cubic_c_gathered_matrix(true);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is the default dst value
add(reg_oc_off, step * sizeof(float));
}
@ -974,7 +982,7 @@ private:
vgatherdps(vmm_weightY, ptr[reg_weight_y + 3 * sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask);
cubic_planar_line(false);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // oc_off is broadcast and always the same value for this channel
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -997,15 +1005,15 @@ private:
// get idx for input
movss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]);
gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, memory::s32, true);
gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, memory::data_type::s32, true);
movss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]);
gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, memory::s32, true);
gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, memory::data_type::s32, true);
// gather weightX by input idx, used in y0-y3
gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, memory::data_type::f32, true);
gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, memory::data_type::f32, true);
gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, memory::data_type::f32, true);
gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, memory::data_type::f32, true);
// vmm_val is now free and reused for dst_value
uni_vpxor(vmm_val, vmm_val, vmm_val);
@ -1015,7 +1023,7 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
gather_i32_indices(vmm_weightY, reg_weight_y, 0, vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, 0, vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
// y1
@ -1023,7 +1031,7 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_in_y, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
// weight y1: shift weight_size
gather_i32_indices(vmm_weightY, reg_weight_y, sizeof(float), vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, sizeof(float), vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
// y2
@ -1032,7 +1040,7 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
// weight y2
gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
// y3
@ -1042,10 +1050,10 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
// weight y3
gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // oc_off is broadcast and always the same value for this channel
}
store_scalar(ptr[reg_dst], Xmm(vmm_val.getIdx()), jcp_.dst_dt);
@ -1093,7 +1101,7 @@ private:
vpaddd(vmm_mask, vmm_mask, vmm_one); // (IW - 1) + 1 = IW
uni_vpmulld(vmm_mask, vmm_mask, vmm_index_y_itr);
uni_vpaddd(vmm_index_x_itr, vmm_index_x_itr, vmm_mask);
gather_i32_indices(vmm_src, reg_src, 0, vmm_index_x_itr, jcp_.src_data_size, memory::f32, is_scalar);
gather_i32_indices(vmm_src, reg_src, 0, vmm_index_x_itr, jcp_.src_data_size, memory::data_type::f32, is_scalar);
if (itr == 0) {
uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX0);
@ -1134,19 +1142,19 @@ private:
inline void gather_i32_indices(Vmm vmm_src, const Xbyak::Reg64 &base, int offset, Vmm vmm_indices, int scale,
memory::data_type src_dt, bool is_scalar) {
Xbyak::Address table_idx = ptr[base + offset + vmm_indices * scale];
if ((isa == cpu::avx512_common) && !is_scalar) {
if ((isa == cpu::x64::avx512_common) && !is_scalar) {
// low 16 bits of the int become the mask
kmovw(k_mask, cubic_planar_table_val(3));
if (src_dt == memory::f32) {
if (src_dt == memory::data_type::f32) {
vgatherdps(vmm_src | k_mask, table_idx); // dword index, packed single data
} else if (src_dt == memory::s32) {
} else if (src_dt == memory::data_type::s32) {
vpgatherdd(vmm_src | k_mask, table_idx); // dword index, dword data
}
} else if ((isa == cpu::avx2) && !is_scalar) {
} else if ((isa == cpu::x64::avx2) && !is_scalar) {
uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask);
if (src_dt == memory::f32) {
if (src_dt == memory::data_type::f32) {
vgatherdps(vmm_src, table_idx, vmm_mask);
} else if (src_dt == memory::s32) {
} else if (src_dt == memory::data_type::s32) {
vpgatherdd(vmm_src, table_idx, vmm_mask);
}
} else {
@ -1177,17 +1185,17 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
@ -1195,23 +1203,23 @@ private:
assert(!"unknown dst_dt");
}
if (src_dt != memory::f32 && src_dt != data_type::bf16)
if (src_dt != memory::data_type::f32 && src_dt != memory::data_type::bf16)
uni_vcvtdq2ps(vmm_src, vmm_src);
}
inline void load_xmm(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(xmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(xmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(xmm_src, op);
uni_vpslld(xmm_src, xmm_src, 16);
break;
@ -1219,25 +1227,25 @@ private:
assert(!"unknown dst_dt");
}
if (src_dt != memory::f32 && src_dt != data_type::bf16)
if (src_dt != memory::data_type::f32 && src_dt != memory::data_type::bf16)
uni_vcvtdq2ps(xmm_src, xmm_src);
}
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
@ -1254,38 +1262,38 @@ private:
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
if (dst_dt == memory::f32) {
if (dst_dt == memory::data_type::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::u8) {
} else if (dst_dt == memory::data_type::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmaxsd(vmm_dst, vmm_dst, vmm_zero);
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::s8) {
} else if (dst_dt == memory::data_type::s8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::bf16) {
} else if (dst_dt == memory::data_type::bf16) {
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
@ -1295,26 +1303,26 @@ private:
}
inline void store_xmm(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
if (dst_dt != memory::f32 && dst_dt != memory::bf16) {
if (dst_dt != memory::data_type::f32 && dst_dt != memory::data_type::bf16) {
uni_vcvtps2dq(xmm_dst, xmm_dst);
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(op, xmm_dst);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movd(op, xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movd(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
pshuflw(xmm_dst, xmm_dst, 0x0d); // 01 01 01 01 --> 01 01 11 00 imm=0b00001101
pshufhw(xmm_dst, xmm_dst, 0x0d); // 01 01 11 00 --> 11 00 11 00
pshufd(xmm_dst, xmm_dst, 0x08); // 11 00 11 00 --> 11 11 00 00 imm=0b00001000
@ -1331,23 +1339,23 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
@ -1362,7 +1370,7 @@ private:
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
int quantization_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto& post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
@ -1377,7 +1385,7 @@ private:
depthwise_inj_idx++;
} else if (post_op.is_quantization()) {
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1;
bool do_rounding = do_dequantization || dst_dt == memory::data_type::f32 || i != p.len() - 1;
int s_idx = vmm_val.getIdx();
@ -1636,49 +1644,56 @@ void MKLDNNInterpolateNode::initSupportedPrimitiveDescriptors() {
auto scalesType = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32);
auto axesType = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::I32);
auto pushDesc = [&](memory::format dataFormat, impl_desc_type implDetail) {
auto pushDesc = [&](memory::format_tag dataFormat, impl_desc_type implDetail) {
config.inConfs[DATA_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(DATA_ID)->getDims(), inputDataType, dataFormat);
config.inConfs[TARGET_SHAPE_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(TARGET_SHAPE_ID)->getDims(), targetShapeType, memory::x);
config.inConfs[SCALES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(SCALES_ID)->getDims(), scalesType, memory::x);
config.inConfs[TARGET_SHAPE_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(TARGET_SHAPE_ID)->getDims(), targetShapeType, memory::format_tag::x);
config.inConfs[SCALES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(SCALES_ID)->getDims(), scalesType, memory::format_tag::x);
if (isAxesSpecified)
config.inConfs[AXES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(AXES_ID)->getDims(), axesType, memory::x);
config.inConfs[AXES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(AXES_ID)->getDims(), axesType, memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, dataFormat);
supportedPrimitiveDescriptors.push_back({config, implDetail, dataFormat});
};
auto channels = getParentEdgeAt(DATA_ID)->getDims().ndims() > 1 ? getParentEdgeAt(DATA_ID)->getDims()[1] : 1;
if (mode != InterpolateMode::linear) {
// blocked and by_channel JIT kernels on machines with SSE4.1 or above
if (mayiuse(cpu::sse42)) {
if (mayiuse(cpu::x64::sse41)) {
if (getParentEdgeAt(DATA_ID)->getDims().ndims() == 4) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nhwc, jit_avx512);
pushDesc(memory::nChw16c, jit_avx512);
} else if (mayiuse(cpu::avx2)) {
pushDesc(memory::nhwc, jit_avx2);
pushDesc(memory::nChw8c, jit_avx2);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nhwc, jit_avx512);
if (channels != 1)
pushDesc(memory::format_tag::nChw16c, jit_avx512);
} else if (mayiuse(cpu::x64::avx2)) {
pushDesc(memory::format_tag::nhwc, jit_avx2);
if (channels != 1)
pushDesc(memory::format_tag::nChw8c, jit_avx2);
} else {
pushDesc(memory::nhwc, jit_sse42);
pushDesc(memory::nChw8c, jit_sse42);
pushDesc(memory::format_tag::nhwc, jit_sse42);
if (channels != 1)
pushDesc(memory::format_tag::nChw8c, jit_sse42);
}
} else if (getParentEdgeAt(DATA_ID)->getDims().ndims() == 5 && mode == InterpolateMode::nearest) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::ndhwc, jit_avx512);
pushDesc(memory::nCdhw16c, jit_avx512);
} else if (mayiuse(cpu::avx2)) {
pushDesc(memory::ndhwc, jit_avx2);
pushDesc(memory::nCdhw8c, jit_avx2);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::ndhwc, jit_avx512);
if (channels != 1)
pushDesc(memory::format_tag::nCdhw16c, jit_avx512);
} else if (mayiuse(cpu::x64::avx2)) {
pushDesc(memory::format_tag::ndhwc, jit_avx2);
if (channels != 1)
pushDesc(memory::format_tag::nCdhw8c, jit_avx2);
} else {
pushDesc(memory::ndhwc, jit_sse42);
pushDesc(memory::nCdhw8c, jit_sse42);
pushDesc(memory::format_tag::ndhwc, jit_sse42);
if (channels != 1)
pushDesc(memory::format_tag::nCdhw8c, jit_sse42);
}
}
}
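// The channels != 1 guards above skip the blocked nChw[8|16]c and
// nCdhw[8|16]c variants for single-channel tensors, where blocking would
// only pad the channel dimension and the nhwc/ndhwc descriptor already
// covers the case.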
// planar layout: 1. reference implementation on machines without SSE4.1 (in that case canFuse() returns false); 2. JIT kernel (gather) for f32 on AVX2, with fusing
if (!mayiuse(cpu::sse42))
if (!mayiuse(cpu::x64::sse41))
pushDesc(MKLDNNMemory::GetPlainFormat(getParentEdgeAt(DATA_ID)->getDims()), ref);
if (mayiuse(cpu::avx2) && inputPrec == Precision::FP32) {
if (mayiuse(cpu::x64::avx2) && inputPrec == Precision::FP32) {
pushDesc(MKLDNNMemory::GetPlainFormat(getParentEdgeAt(DATA_ID)->getDims()), jit_avx2);
}
} else {
@ -1708,7 +1723,6 @@ void MKLDNNInterpolateNode::createPrimitive() {
THROW_IE_EXCEPTION << "Interpolate layer with name '" << getName() << "' did not set preferable primitive descriptor";
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[0].desc.getLayout();
auto jcp = jit_interpolate_config_params();
jcp.mode = mode;
jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[0].desc.getPrecision());
@ -1722,29 +1736,33 @@ void MKLDNNInterpolateNode::createPrimitive() {
jcp.IW = srcDimPad[dimSize - 1];
jcp.IH = srcDimPad[dimSize - 2];
if (MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims()) == selected_layout) {
if (getChildEdgeAt(0)->getMemory().GetDesc().isPlainFormat()) {
jcp.layout = InterpolateLayoutType::planar;
} else if ((selected_layout == NHWC) || (selected_layout == NDHWC)) {
jcp.layout = InterpolateLayoutType::by_channel;
} else {
} else if (getChildEdgeAt(0)->getMemory().GetDesc().isBlockedCFormat()) {
jcp.layout = InterpolateLayoutType::block;
} else {
jcp.layout = InterpolateLayoutType::by_channel;
}
configured_for_layout = jcp.layout;
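// The effective layout is now resolved once here from the selected memory
// descriptor (isPlainFormat()/isBlockedCFormat()) and cached in
// configured_for_layout, so execute() no longer re-derives it from the
// InferenceEngine layout enum on every call.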
if (mode == InterpolateMode::nearest || mode == InterpolateMode::linear_onnx || mode == InterpolateMode::cubic) {
if (jcp.layout != InterpolateLayoutType::planar) {
if (mayiuse(cpu::avx512_common)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::avx2)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::sse42)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::sse42>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx512_common)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::avx2)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::sse41)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::sse41>(jcp, *attr.get()));
}
} else {
// gather ISA (for the planar JIT kernel) on AVX2 with fp32
if (mayiuse(cpu::avx2) && inputPrec == Precision::FP32) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::avx2>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx2) && inputPrec == Precision::FP32) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
}
}
if (interpolateKernel)
interpolateKernel->create_ker();
}
// build indices table
@ -2133,10 +2151,8 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
auto &srcMemPtr = getParentEdgeAt(DATA_ID)->getMemoryPtr();
uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dstDataSize;
uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * srcDataSize;
uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetPtr());
size_t dimSize = srcDim.size();
SizeVector srcDimPad = getPaddedInputShape();
@ -2145,16 +2161,6 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
auto srcDimPad5d = to5Dim(srcDimPad);
auto dstDim5d = to5Dim(dstDim);
InterpolateLayoutType layout;
Layout selected_layout = getParentEdgeAt(DATA_ID)->getDesc().getLayout();
if (MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims()) == selected_layout) {
layout = InterpolateLayoutType::planar;
} else if ((selected_layout == NHWC) || (selected_layout == NDHWC)) {
layout = InterpolateLayoutType::by_channel;
} else {
layout = InterpolateLayoutType::block;
}
uint8_t *src_data = nullptr;
std::vector<uint8_t> srcPadded;
if (hasPad) {
@ -2167,7 +2173,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
SizeVector inShapeBlock = getBlockND(srcDim5d);
SizeVector inShapePadBlock = getBlockND(srcDimPad5d);
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
@ -2177,7 +2183,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
});
src_data = src_data_pad;
} else if (layout == InterpolateLayoutType::by_channel) {
} else if (configured_for_layout == InterpolateLayoutType::by_channel) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
@ -2188,8 +2194,8 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
});
src_data = src_data_pad;
} else if (layout == InterpolateLayoutType::block) {
size_t blkSize = mayiuse(cpu::avx512_common) ? 16 : 8;
} else if (configured_for_layout == InterpolateLayoutType::block) {
size_t blkSize = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
size_t CB = div_up(srcDimPad5d[1], blkSize);
size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
srcPadded.resize(eltsTotal * srcDataSize, 0x0);
@ -2227,7 +2233,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
switch (mode) {
case InterpolateMode::nearest: {
if (interpolateKernel) {
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
NNPlanar(src_data, dst_data, N, C, ID, IH, IW, OD, OH, OW);
} else {
NNCGathered(src_data, dst_data, N, C, ID, IH, IW, OD, OH, OW);
@ -2239,7 +2245,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
}
case InterpolateMode::linear_onnx: {
if (interpolateKernel) {
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
linearOnnxPlanar(src_data, dst_data, N, C, IH, IW, OH, OW);
} else {
linearOnnxCGathered(src_data, dst_data, N, C, IH, IW, OH, OW);
@ -2251,7 +2257,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
}
case InterpolateMode::cubic: {
if (interpolateKernel) {
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
cubicPlanar(src_data, dst_data, N, C, IH, IW, OH, OW);
} else {
cubicCGathered(src_data, dst_data, N, C, IH, IW, OH, OW);
@ -2284,8 +2290,7 @@ void MKLDNNInterpolateNode::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr
int *index_h = static_cast<int*>(&indexTable[OD]);
int *index_w = static_cast<int*>(&indexTable[OD + OH]);
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
bool is_nhwc = (layout == NHWC || layout == NDHWC) ? true : false;
bool is_nhwc = (configured_for_layout == by_channel);
for (int b = 0; b < B; b++) {
if (is_nhwc) {
@ -2308,7 +2313,7 @@ void MKLDNNInterpolateNode::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr
(*interpolateKernel)(&arg);
});
} else { // for blk
int blk_size = mayiuse(cpu::avx512_common) ? 16 : 8;
int blk_size = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
int CB = div_up(C, blk_size);
const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize;
uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize;
@ -2414,10 +2419,9 @@ void MKLDNNInterpolateNode::linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t
float *weightTop = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW]);
float *weightBottom = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + OH]);
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
bool isByChannel = (layout == NHWC) ? true : false;
bool isByChannel = (configured_for_layout == by_channel);
int blkSize = mayiuse(cpu::avx512_common) ? 16 : 8;
int blkSize = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
int CB = div_up(C, blkSize);
int CSize = isByChannel ? C : blkSize * CB;
int CGatherLen = isByChannel ? C : blkSize;
@ -2600,14 +2604,11 @@ void MKLDNNInterpolateNode::cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_
int *yOrigin = static_cast<int*>(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW]);
float *yFactor = reinterpret_cast<float*>(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]);
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
bool isByChannel = (layout == NHWC) ? true : false;
int blkSize = mayiuse(cpu::avx512_common) ? 16 : 8;
int blkSize = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
int CB = div_up(C, blkSize);
int CSize = isByChannel ? C : blkSize * CB;
int CGatherLen = isByChannel ? C : blkSize;
int workAmount = isByChannel ? C : CB;
int CSize = configured_for_layout == InterpolateLayoutType::by_channel ? C : blkSize * CB;
int CGatherLen = configured_for_layout == InterpolateLayoutType::by_channel ? C : blkSize;
int workAmount = configured_for_layout == InterpolateLayoutType::by_channel ? C : CB;
parallel_for3d(B, OH, OW, [&](size_t b, size_t h, size_t w) {
uint8_t *out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize;
@ -2848,7 +2849,7 @@ bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
return false;
};
if (!mayiuse(cpu::sse42) || mode == InterpolateMode::linear) {
if (!mayiuse(cpu::x64::sse41) || mode == InterpolateMode::linear) {
return false;
}

View File

@ -75,6 +75,8 @@ struct jit_uni_interpolate_kernel {
explicit jit_uni_interpolate_kernel(jit_interpolate_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
virtual ~jit_uni_interpolate_kernel() {}
virtual void create_ker() = 0;
jit_interpolate_config_params jcp_;
const mkldnn_primitive_attr &attr_;
};
@ -163,6 +165,8 @@ private:
InferenceEngine::Precision inputPrec, outputPrec;
size_t srcDataSize, dstDataSize;
InterpolateLayoutType configured_for_layout;
std::vector<int> indexTable;
std::shared_ptr<jit_uni_interpolate_kernel> interpolateKernel;

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_lrn_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <string>
#include <mkldnn_extension_utils.h>
@ -52,8 +52,11 @@ void MKLDNNLrnNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<lrn_forward::primitive_desc, lrn_forward::desc>();
prim.reset(new lrn_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new lrn_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
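
// In oneDNN v1.x a primitive is built from its descriptor alone, and memory
// objects are passed at execution time through an argument map instead of
// being bound in the constructor as in v0.x. Minimal standalone sketch of the
// new execution model (not the node's exact code):
#include <mkldnn.hpp>

void run_lrn(const mkldnn::lrn_forward::primitive_desc &pd, mkldnn::stream &strm,
             const mkldnn::memory &src, const mkldnn::memory &dst) {
    mkldnn::lrn_forward prim(pd);                    // no memories at construction
    std::unordered_map<int, mkldnn::memory> args = {
        {DNNL_ARG_SRC, src},                         // bound per execution
        {DNNL_ARG_DST, dst}};
    prim.execute(strm, args);
}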
bool MKLDNNLrnNode::created() const {
@ -86,7 +89,7 @@ void MKLDNNLrnNode::initOptimalPrimitiveDescriptor() {
void MKLDNNLrnNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
algorithm alg = (isAcrossMaps) ? lrn_across_channels : lrn_within_channel;
algorithm alg = (isAcrossMaps) ? algorithm::lrn_across_channels : algorithm::lrn_within_channel;
MKLDNNMemoryDesc in_candidate(inputDesc[0]);
MKLDNNDescriptor desc(std::shared_ptr<lrn_forward::desc>(
new lrn_forward::desc(prop_kind::forward_scoring, alg, in_candidate, size, alpha, beta, k)));

View File

@ -39,7 +39,7 @@ void MKLDNNMemoryOutputNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].inPlace = -1;
config.inConfs[0].constant = false;
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, MKLDNNMemory::GetPlainFormat(getParentEdgeAt(0)->getDims()));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, memory::format::any);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, memory::format_tag::any);
}
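
// memory::format was split in oneDNN v1.x: memory::format_tag names a layout
// tag, while blocked layouts are described structurally in the descriptor.
// format_tag::any still lets the primitive pick the optimal layout, e.g.:
mkldnn::memory::desc md({1, 64, 56, 56},               // NCHW logical dims
                        mkldnn::memory::data_type::f32,
                        mkldnn::memory::format_tag::any);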
void MKLDNNMemoryOutputNode::execute(mkldnn::stream strm) {
@ -76,14 +76,8 @@ void MKLDNNMemoryInputNode::createPrimitive() {
*/
inline
static void simple_copy(MKLDNNMemory& dst, const MKLDNNMemory& src) {
auto getDataWithOff = [] (const MKLDNNMemory& mem) {
auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(mem.GetDataType());
return static_cast<uint8_t*>(mem.GetData()) +
mem.GetDescriptor().data.layout_desc.blocking.offset_padding * elemSize;
};
auto srcPtr = getDataWithOff(src);
auto dstPtr = getDataWithOff(dst);
auto srcPtr = static_cast<uint8_t*>(src.GetPtr());
auto dstPtr = static_cast<uint8_t*>(dst.GetPtr());
auto srcSizeInByte = src.GetSize();
auto dstSizeInByte = dst.GetSize();
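
// GetPtr() folds the padded-offset arithmetic that simple_copy() previously
// did by hand into the memory wrapper itself. A sketch of what such a helper
// computes in oneDNN v1.x terms (an assumption about MKLDNNMemory internals,
// using the offset0 field of dnnl_memory_desc_t):
static uint8_t* get_ptr(const mkldnn::memory &mem) {
    auto md = mem.get_desc().data;      // dnnl_memory_desc_t
    auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(
        static_cast<mkldnn::memory::data_type>(md.data_type));
    return static_cast<uint8_t*>(mem.get_data_handle()) + md.offset0 * elemSize;
}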

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_mvn_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include "mkldnn_eltwise_node.h"
@ -17,16 +17,17 @@
#include "ie_parallel.hpp"
#include <algorithm>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
@ -38,7 +39,7 @@ static inline bool isFloatCompatible(Precision prc) {
}
static inline bool isFloatCompatible(memory::data_type type) {
return memory::f32 == type || memory::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
}
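
// memory::f32, memory::s8, etc. became scoped enumerators in v1.x
// (memory::data_type::f32), so comparison and switch sites throughout the
// file change mechanically:
bool is_float_compatible(mkldnn::memory::data_type t) {
    using dt = mkldnn::memory::data_type;  // scoped enum in oneDNN v1.x
    return t == dt::f32 || t == dt::bf16;
}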
// normalize_variance = false : src->mean
@ -47,7 +48,14 @@ template <cpu_isa_t isa>
struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_mvn_mean_kernel_f32)
explicit jit_uni_mvn_mean_variance_kernel_f32(jit_mvn_config_params jcp) : jit_uni_mvn_mean_variance_kernel(jcp), jit_generator() {
explicit jit_uni_mvn_mean_variance_kernel_f32(jit_mvn_config_params jcp) : jit_uni_mvn_mean_variance_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
if (jcp_.normalize_variance) {
@ -59,7 +67,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
mov(reg_stride, ptr[reg_params + GET_OFF(src_stride)]);
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::sse42) ? 2 : 1; // block size is also 8 on cpu::sse42
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::x64::sse41) ? 2 : 1; // block size is also 8 on cpu::x64::sse41
for (int i = 0; i < repeats; i++) {
int offset_sse42 = i * 4;
if (i > 0) {
@ -120,9 +128,9 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
if (jcp_.planar_layout) {
Vmm vmm_dst = jcp_.normalize_variance ? vmm_variance : vmm_sum;
// hsum+store
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
hsum_store(vmm_dst);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_sum = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_sum, 0);
vextractf128(xmm_aux2, ymm_sum, 1);
@ -162,11 +170,10 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
}
this->postamble();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
Xbyak::Reg64 reg_src = r8;
@ -199,17 +206,17 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
@ -224,13 +231,20 @@ template <cpu_isa_t isa>
struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_mvn_kernel_f32)
explicit jit_uni_mvn_kernel_f32(jit_mvn_config_params jcp, const mkldnn_primitive_attr &attr) : jit_uni_mvn_kernel(jcp, attr), jit_generator() {
explicit jit_uni_mvn_kernel_f32(jit_mvn_config_params jcp, const mkldnn_primitive_attr &attr) : jit_uni_mvn_kernel(jcp, attr), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
const auto &p = attr_.post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale));
} else if (post_op.is_depthwise()) {
depthwise_injectors.push_back(std::make_shared<jit_uni_depthwise_injector_f32<isa>>(
this, post_op.depthwise.alg));
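
// Two API shifts are visible here: post_ops_.len_ became the len() accessor,
// and eltwise post-op entries gained a scale field that the injector
// constructor now takes. Construction sketch using the entry layout above:
for (int i = 0; i < p.len(); i++) {                    // len_ field -> len() method
    const auto &e = p.entry_[i];
    if (e.is_eltwise())
        eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
            this, e.eltwise.alg, e.eltwise.alpha, e.eltwise.beta,
            e.eltwise.scale));                         // scale is new in v1.6
}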
@ -252,13 +266,13 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]);
mov(reg_dst_stride, ptr[reg_params + GET_OFF(dst_stride)]);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
if (isa == avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::sse42) ? 2 : 1; // block size is also 8 on cpu::sse42
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::x64::sse41) ? 2 : 1; // block size is also 8 on cpu::x64::sse41
for (int i = 0; i < repeats; i++) {
int offset_sse42 = i * 4;
if (i > 0) {
@ -270,7 +284,7 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
add(reg_dst, offset_sse42 * jcp_.dst_data_size);
add(reg_mean, offset_sse42 * sizeof(float));
add(reg_variance_inv, offset_sse42 * sizeof(float));
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
add(reg_oc_off, offset_sse42 * sizeof(float));
}
@ -319,12 +333,10 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
for (auto& inj : eltwise_injectors)
inj->prepare_table();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
const int vlen = cpu_isa_traits<isa>::vlen;
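
// The vector register type is picked at compile time from the ISA (Xmm for
// sse41, Ymm for avx2, Zmm for avx512_common), and vlen follows it: 16, 32,
// or 64 bytes. An equivalent std::conditional spelling of the conditional3
// helper, for reference (requires <type_traits>):
template <cpu_isa_t isa>
using Vmm = typename std::conditional<isa == cpu::x64::sse41, Xbyak::Xmm,
            typename std::conditional<isa == cpu::x64::avx2, Xbyak::Ymm,
                                      Xbyak::Zmm>::type>::type;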
@ -360,17 +372,17 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
@ -386,39 +398,39 @@ private:
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
if (dst_dt == memory::f32) {
if (dst_dt == memory::data_type::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::bf16) {
} else if (dst_dt == memory::data_type::bf16) {
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
} else if (dst_dt == memory::u8) {
} else if (dst_dt == memory::data_type::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmaxsd(vmm_dst, vmm_dst, vmm_zero);
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::s8) {
} else if (dst_dt == memory::data_type::s8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -431,7 +443,7 @@ private:
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
int quantization_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto& post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
@ -445,7 +457,7 @@ private:
depthwise_inj_idx++;
} else if (post_op.is_quantization()) {
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len() - 1;
int s_idx = vmm_val.getIdx();
quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
@ -539,18 +551,18 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].inPlace = -1;
config.outConfs[0].inPlace = canBeInplace ? 0 : -1;
auto pushDesc = [&](memory::format format, impl_desc_type impl_type) {
auto pushDesc = [&](memory::format_tag format, impl_desc_type impl_type) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), outputDataType, format);
supportedPrimitiveDescriptors.push_back({config, impl_type, format});
};
impl_desc_type impl_type;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
} else {
impl_type = impl_desc_type::ref;
@ -558,24 +570,24 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
if (across_channels == 0 && normalize_variance == 1) {
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
pushDesc(memory::nhwc, impl_type);
pushDesc(memory::format_tag::nhwc, impl_type);
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
pushDesc(memory::ndhwc, impl_type);
pushDesc(memory::format_tag::ndhwc, impl_type);
}
}
if (isFloatCompatible(inputPrecision) && isFloatCompatible(outputPrecision)) {
if (impl_desc_type::jit_avx512 == impl_type) {
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
pushDesc(memory::nChw16c, impl_type);
pushDesc(memory::format_tag::nChw16c, impl_type);
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
pushDesc(memory::nCdhw16c, impl_type);
pushDesc(memory::format_tag::nCdhw16c, impl_type);
}
} else if (impl_desc_type::jit_avx2 == impl_type || impl_desc_type::jit_sse42 == impl_type) {
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
pushDesc(memory::nChw8c, impl_type);
pushDesc(memory::format_tag::nChw8c, impl_type);
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
pushDesc(memory::nCdhw8c, impl_type);
pushDesc(memory::format_tag::nCdhw8c, impl_type);
}
}
@ -607,34 +619,42 @@ void MKLDNNMVNNode::createPrimitive() {
jcp.normalize_variance = normalize_variance;
jcp.across_channels = across_channels;
if (mayiuse(cpu::avx512_common)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::avx512_common>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx512_common)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::x64::avx512_common>(jcp, *attr.get()));
jcp.normalize_variance = false;
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx512_common>(jcp));
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx512_common>(jcp));
if (normalize_variance) {
jcp.normalize_variance = true;
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx512_common>(jcp));
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx512_common>(jcp));
}
} else if (mayiuse(cpu::avx2)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::avx2)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
jcp.normalize_variance = false;
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx2>(jcp));
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx2>(jcp));
if (normalize_variance) {
jcp.normalize_variance = true;
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx2>(jcp));
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx2>(jcp));
}
} else if (mayiuse(cpu::sse42)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::sse42>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::sse41)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::x64::sse41>(jcp, *attr.get()));
jcp.normalize_variance = false;
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::sse42>(jcp));
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::sse41>(jcp));
if (normalize_variance) {
jcp.normalize_variance = true;
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::sse42>(jcp));
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::sse41>(jcp));
}
}
if (mvn_kernel)
mvn_kernel->create_ker();
if (mvn_mean_kernel)
mvn_mean_kernel->create_ker();
if (mvn_variance_kernel)
mvn_variance_kernel->create_ker();
}
void MKLDNNMVNNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) {
@ -782,11 +802,11 @@ std::tuple<size_t, size_t, size_t, size_t, size_t> MKLDNNMVNNode::get5dShapes(co
template <typename in_data_t, typename out_data_t>
void MKLDNNMVNNode::mvn_pln(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) {
size_t blk_size = 1; // blk size in vmm
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 4;
}
@ -1005,10 +1025,10 @@ template <typename in_data_t, typename out_data_t>
void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) {
size_t blk_size = 1; // channel blk for memory layout
size_t ele_in_vmm = 4;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
ele_in_vmm = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
ele_in_vmm = 8;
} else {
@ -1036,7 +1056,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
size_t C3 = C2 * CB;
size_t C5 = C * D * H * W;
size_t threads_num = mkldnn_get_max_threads();
size_t threads_num = parallel_get_num_threads();
size_t aux_buffer_size = across_channels ? blk_size : rnd_up(C, blk_size);
std::vector<float> mean_buffer(aux_buffer_size * threads_num);
std::vector<float> variance_buffer(aux_buffer_size * threads_num);
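
// mkldnn_get_max_threads()/mkldnn_get_thread_num() are replaced by oneDNN's
// parallel_get_num_threads()/parallel_get_thread_num() helpers; the
// per-thread scratch pattern itself is unchanged. Sketch, assuming
// ie_parallel's parallel_for and an accumulate step per work item:
size_t nthr = parallel_get_num_threads();
std::vector<float> scratch(aux_buffer_size * nthr, 0.f);

parallel_for(work_amount, [&](size_t idx) {
    // each worker accumulates into its own slice, indexed by its thread id
    float *buf = &scratch[aux_buffer_size * parallel_get_thread_num()];
    // ... accumulate partial sums for this work item into buf ...
});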
@ -1053,7 +1073,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
float mean_internal = 0.0f;
if ((min_cb == blk_size) && mvn_mean_kernel) {
auto mean_buffer_ptr = &mean_buffer[blk_size * mkldnn_get_thread_num()];
auto mean_buffer_ptr = &mean_buffer[blk_size * parallel_get_thread_num()];
for (int i = 0; i < blk_size; i++)
mean_buffer_ptr[i] = 0.f;
@ -1089,7 +1109,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
float variance_internal = 0.0f;
if ((blk_size == min_cb) && mvn_variance_kernel) {
auto variance_buffer_ptr = &variance_buffer[blk_size * mkldnn_get_thread_num()];
auto variance_buffer_ptr = &variance_buffer[blk_size * parallel_get_thread_num()];
for (int i = 0; i < blk_size; i++)
variance_buffer_ptr[i] = 0.f;
@ -1321,7 +1341,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
float dst_value = (src_data[ch + w * src_stride] - mean_buffer_ptr[c]) * variance_buffer_ptr[c];
if (!fusedWith.empty()) {
const auto &p = (*attr.get()).post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
// only eltwise_relu supported
@ -1335,7 +1355,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
bool do_dequantization = post_op.quantization.alg ==
alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || isFloatCompatible(output_prec) ||
i != p.len_ - 1;
i != p.len() - 1;
auto quant = post_op.quantization;
float crl = quant.crop_low_data->shifts_[quant.crop_low_data->count_ == 1 ? 0 : cb * blk_size + c];

View File

@ -48,6 +48,8 @@ struct jit_uni_mvn_mean_variance_kernel {
explicit jit_uni_mvn_mean_variance_kernel(jit_mvn_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_mvn_mean_variance_kernel() {}
virtual void create_ker() = 0;
jit_mvn_config_params jcp_;
};
@ -62,6 +64,8 @@ struct jit_uni_mvn_kernel {
explicit jit_uni_mvn_kernel(jit_mvn_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
virtual ~jit_uni_mvn_kernel() {}
virtual void create_ker() = 0;
jit_mvn_config_params jcp_;
const mkldnn_primitive_attr &attr_;
};

View File

@ -2,39 +2,48 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_normalize_node.h"
#include <legacy/ie_layers_internal.hpp>
#include <ie_parallel.hpp>
#include "mkldnn_quantize_node.h"
#include "mkldnn_eltwise_node.h"
#include <mkldnn_extension_utils.h>
#include "utils/bfloat16.hpp"
#include <legacy/ie_layers_internal.hpp>
#include "ie_parallel.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include "mkldnn_extension_utils.h"
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include "bf16transformer.h"
#include "common/cpu_memcpy.h"
#include "mkldnn_normalize_node.h"
#include <mkldnn_selective_build.h>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
#define GET_OFF(field) offsetof(jit_normalize_call_args, field)
static inline bool isFloatCompatible(memory::data_type type) {
return memory::f32 == type || memory::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
}
template <cpu_isa_t isa>
struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_normalize_modulo_kernel_f32)
jit_uni_normalize_modulo_kernel_f32(jit_normalize_config_params jcp) : jit_uni_normalize_modulo_kernel(jcp), jit_generator() {
jit_uni_normalize_modulo_kernel_f32(jit_normalize_config_params jcp) : jit_uni_normalize_modulo_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
mov(reg_modulo, ptr[reg_params + GET_OFF(modulo)]);
@ -52,7 +61,7 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker
load_vector(vmm_val, ptr[reg_src], jcp_.src_dt);
uni_vfmadd231ps(vmm_sqr_sum, vmm_val, vmm_val);
if (isa == cpu::sse42 && jcp_.is_blk) {
if (isa == cpu::x64::sse41 && jcp_.is_blk) {
int sse42_offset = 4;
load_vector(vmm_val, ptr[reg_src + sse42_offset * jcp_.src_data_size], jcp_.src_dt);
uni_vfmadd231ps(vmm_sqr_sum, vmm_val, vmm_val);
@ -69,9 +78,9 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker
uni_vmovups(ptr[reg_modulo], vmm_sqr_sum);
} else {
// hsum+store
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
hsum_store(vmm_sqr_sum);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_sqr_sum = Xbyak::Ymm(vmm_sqr_sum.getIdx());
vextractf128(xmm_aux1, ymm_sqr_sum, 0);
vextractf128(xmm_aux2, ymm_sqr_sum, 1);
@ -91,11 +100,10 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker
}
this->postamble();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -121,18 +129,18 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -149,13 +157,20 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_normalize_kernel_f32)
explicit jit_uni_normalize_kernel_f32(jit_normalize_config_params jcp, const mkldnn_primitive_attr &attr)
: jit_uni_normalize_kernel(jcp, attr), jit_generator() {
: jit_uni_normalize_kernel(jcp, attr), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
const auto &p = attr_.post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale));
} else if (post_op.is_depthwise()) {
depthwise_injectors.push_back(std::make_shared<jit_uni_depthwise_injector_f32<isa>>(
this, post_op.depthwise.alg));
@ -176,7 +191,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
mov(reg_weights, ptr[reg_params + GET_OFF(weights)]);
mov(reg_fused_factor, ptr[reg_params + GET_OFF(fused_factor)]);
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
if (isa == avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
@ -195,12 +210,10 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
emu_vcvtneps2bf16->emit_table();
for (auto& inj : eltwise_injectors)
inj->prepare_table();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -255,7 +268,7 @@ private:
Xbyak::Label tail_loop_label;
Xbyak::Label tail_loop_end_label;
int step = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
int step = jcp_.src_dt == memory::data_type::bf16 ? 16 : (vlen / sizeof(float));
L(main_loop_label);
{
cmp(reg_work_amount, step);
@ -276,7 +289,7 @@ private:
add(reg_modulo, vlen);
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 1);
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -300,17 +313,17 @@ private:
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
} else {
if (jcp_.channel_shared) {
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::f32);
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
add(reg_fused_factor, step * sizeof(float));
} else {
load_scalar(xmm_modulo, ptr[reg_modulo], memory::f32);
load_scalar(xmm_modulo, ptr[reg_modulo], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_modulo);
uni_vmulps(xmm_val, xmm_val, xmm_scale);
add(reg_modulo, step * sizeof(float));
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 1); // vector and broadcast
}
store_scalar(ptr[reg_dst], xmm_val, jcp_.dst_dt);
@ -338,7 +351,7 @@ private:
Xbyak::Label tail_loop_label;
Xbyak::Label tail_loop_end_label;
int step = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
int step = jcp_.src_dt == memory::data_type::bf16 ? 16 : (vlen / sizeof(float));
L(main_loop_label);
{
cmp(reg_work_amount, step);
@ -359,7 +372,7 @@ private:
add(reg_weights, vlen);
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, vlen); // out channel offset of fused ops weights in byte
}
@ -384,17 +397,17 @@ private:
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
} else {
if (jcp_.across_spatial) {
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::f32);
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
add(reg_fused_factor, step * sizeof(float));
} else {
load_scalar(xmm_scale, ptr[reg_weights], memory::f32);
load_scalar(xmm_scale, ptr[reg_weights], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_scale);
uni_vmulps(xmm_val, xmm_val, xmm_modulo);
add(reg_weights, step * sizeof(float));
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, step * sizeof(float));
}
@ -413,15 +426,15 @@ private:
inline void normalize_blk() {
size_t blk_size = 0;
size_t simd_w = 0;
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
blk_size = simd_w = 16;
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
blk_size = simd_w = 8;
} else {
blk_size = 8;
simd_w = 4;
}
bool is_sse42 = (isa == cpu::sse42);
bool is_sse42 = (isa == cpu::x64::sse41);
if (jcp_.across_spatial) {
if (jcp_.channel_shared) {
@ -444,7 +457,7 @@ private:
load_vector(vmm_val, ptr[reg_src], jcp_.src_dt);
uni_vmulps(vmm_val, vmm_val, vmm_fused_factor);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -457,7 +470,7 @@ private:
} else {
uni_vmulps(vmm_val, vmm_val, vmm_fused_factor2); // ld once
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
add(reg_oc_off, sse42_offset * sizeof(float));
apply_post_ops(jcp_.dst_dt, 0);
sub(reg_oc_off, sse42_offset * sizeof(float));
@ -497,7 +510,7 @@ private:
uni_vmulps(vmm_val, vmm_val, vmm_modulo);
add(reg_weights, vlen);
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, vlen); // vlen depends on the ISA
}
@ -514,7 +527,7 @@ private:
uni_vmulps(vmm_val, vmm_val, vmm_modulo); // bc once
add(reg_weights, vlen); // 4 * sizeof(float)
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, vlen); // vlen depends on the ISA
}
@ -532,18 +545,18 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -555,19 +568,19 @@ private:
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
@ -584,39 +597,39 @@ private:
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
if (dst_dt == memory::f32) {
if (dst_dt == memory::data_type::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::bf16) {
} else if (dst_dt == memory::data_type::bf16) {
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
} else if (dst_dt == memory::u8) {
} else if (dst_dt == memory::data_type::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmaxsd(vmm_dst, vmm_dst, vmm_zero);
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::s8) {
} else if (dst_dt == memory::data_type::s8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -630,21 +643,21 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
@ -662,7 +675,7 @@ private:
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
int quantization_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto& post_op = p.entry_[i];
if (post_op.is_eltwise()) {
if (eltwise_injectors.size() <= eltwise_inj_idx
@ -686,7 +699,7 @@ private:
|| quantization_injectors[quantization_inj_idx] == nullptr)
assert(!"Invalid quantization injectors.");
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len() - 1;
int s_idx = vmm_val.getIdx();
@ -835,7 +848,7 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].inPlace = -1;
config.outConfs[0].inPlace = canBeInplace ? 0 : -1;
auto pushDesc = [&](memory::format format) {
auto pushDesc = [&](memory::format_tag format) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), outputDataType, format);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, format});
@ -843,12 +856,12 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
// only the plain layout is supported w/o sse4.1
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
if (mayiuse(cpu::sse42)) {
pushDesc(memory::nhwc);
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nChw16c);
if (mayiuse(cpu::x64::sse41)) {
pushDesc(memory::format_tag::nhwc);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nChw16c);
} else {
pushDesc(memory::nChw8c);
pushDesc(memory::format_tag::nChw8c);
}
}
}
@ -890,15 +903,20 @@ void MKLDNNNormalizeNode::createPrimitive() {
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[0].desc.getLayout();
auto jcp = jit_normalize_config_params();
jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[0].desc.getPrecision());
jcp.dst_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().outConfs[0].desc.getPrecision());
jcp.src_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.src_dt);
jcp.dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.dst_dt);
jcp.is_nchw = selected_layout == MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims());
jcp.is_nhwc = selected_layout == Layout::NHWC;
jcp.is_blk = selected_layout == Layout::BLOCKED;
jcp.is_nchw = jcp.is_nhwc = jcp.is_blk = false;
if (getParentEdgeAt(0)->getMemory().GetDesc().isPlainFormat()) {
jcp.is_nchw = true;
} else if (getParentEdgeAt(0)->getMemory().GetDesc().isBlockedCFormat()) {
jcp.is_blk = true;
} else {
jcp.is_nhwc = true;
}
jcp.across_spatial = across_spatial;
jcp.channel_shared = channel_shared;
auto dims = getParentEdgeAt(0)->getDesc().getDims();
@ -908,25 +926,30 @@ void MKLDNNNormalizeNode::createPrimitive() {
jcp.h = (dims_size > 2) ? dims[2] : 1lu;
jcp.w = (dims_size > 3) ? dims[3] : 1lu;
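
// Layout classification now asks the memory descriptor directly
// (isPlainFormat/isBlockedCFormat) instead of comparing
// InferenceEngine::Layout enums. A sketch of what such predicates inspect
// (an assumption about their implementation, using real dnnl_memory_desc_t
// fields):
bool is_plain(const mkldnn::memory::desc &md) {
    return md.data.format_kind == dnnl_blocked &&
           md.data.format_desc.blocking.inner_nblks == 0;   // no inner blocks
}
bool is_blocked_c(const mkldnn::memory::desc &md) {
    const auto &b = md.data.format_desc.blocking;
    return b.inner_nblks == 1 && b.inner_idxs[0] == 1;      // channels blocked, e.g. nChw8c/16c
}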
if (mayiuse(cpu::avx512_common)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::avx512_common>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::avx2)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::avx2>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::sse42)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::sse42>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::sse42>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx512_common)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::x64::avx512_common>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::x64::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::avx2)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::x64::avx2>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::sse41)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::x64::sse41>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::x64::sse41>(jcp, *attr.get()));
}
if (normalize_kernel)
normalize_kernel->create_ker();
if (normalize_modulo_kernel)
normalize_modulo_kernel->create_ker();
const auto &p = (*attr.get()).post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors_ref.push_back(std::make_shared<ref_eltwise_scalar_fwd_t>(
post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
eltwise_injectors_ref.push_back(std::make_shared<cpu::ref_eltwise_scalar_fwd_t>(
post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale));
} else if (post_op.is_depthwise()) {
depthwise_injectors_ref.push_back(std::make_shared<ref_depthwise_scalar_fwd_t>(
depthwise_injectors_ref.push_back(std::make_shared<cpu::ref_depthwise_scalar_fwd_t>(
post_op.depthwise.alg));
}
}
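
// The reference (non-JIT) post-op path keeps the same shape; only len_ became
// len() and the scalar functors moved under cpu::. Application sketch for a
// single value, assuming the injector vector built above:
float apply_eltwise_post_ops(float v, const mkldnn::impl::post_ops_t &p,
        const std::vector<std::shared_ptr<cpu::ref_eltwise_scalar_fwd_t>> &inj) {
    int idx = 0;
    for (int i = 0; i < p.len(); i++)
        if (p.entry_[i].is_eltwise())
            v = inj[idx++]->compute_scalar(v);   // scalar reference implementation
    return v;
}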
@ -958,12 +981,8 @@ struct MKLDNNNormalizeNode::NormalizeExecute {
void MKLDNNNormalizeNode::execute(mkldnn::stream strm) {
auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
const uint8_t *src_ptr = reinterpret_cast<const uint8_t*>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemPtr->GetDescriptor().data.data_type));
uint8_t *dst_ptr = reinterpret_cast<uint8_t*>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemPtr->GetDescriptor().data.data_type));
const uint8_t *src_ptr = reinterpret_cast<const uint8_t*>(srcMemPtr->GetPtr());
uint8_t *dst_ptr = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
auto dims = getParentEdgeAt(0)->getDesc().getDims();
@ -990,11 +1009,11 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) {
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
size_t blk_size = 1; // elt in vmm
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 4;
}
@ -1186,11 +1205,11 @@ void MKLDNNNormalizeNode::normalize_nchw_ref(const in_data_t* src_data, out_data
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
size_t blk_size = 1; // elt in vmm
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 4;
}
@ -1307,11 +1326,11 @@ void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t*
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
size_t blk_size = 1; // channel blk for memory layout
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 8;
}
@ -1439,20 +1458,18 @@ void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* d
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_function(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[0].desc.getLayout();
if (mayiuse(cpu::sse42) && normalize_modulo_kernel && normalize_kernel) {
if (selected_layout == MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims())) {
if (mayiuse(cpu::x64::sse41) && normalize_modulo_kernel && normalize_kernel) {
if (jcp.is_nchw) {
normalize_nchw(src_data, dst_data, dims);
} else if (selected_layout == Layout::NHWC) {
} else if (jcp.is_nhwc) {
normalize_nhwc(src_data, dst_data, dims);
} else if (selected_layout == Layout::BLOCKED) {
} else if (jcp.is_blk) {
normalize_blk(src_data, dst_data, dims);
} else {
THROW_IE_EXCEPTION << "The selected layout is not supported.";
}
} else {
if (selected_layout == MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims())) {
if (jcp.is_nchw) {
normalize_nchw_ref(src_data, dst_data, dims);
} else {
THROW_IE_EXCEPTION << "Only support plain layout on machine w/o sse42.";
@ -1464,7 +1481,7 @@ inline void MKLDNNNormalizeNode::apply_post_ops_scalar(float &dst_value, int ind
const auto &p = (*attr.get()).post_ops_;
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
dst_value = eltwise_injectors_ref[eltwise_inj_idx]->compute_scalar(dst_value);
@ -1476,7 +1493,7 @@ inline void MKLDNNNormalizeNode::apply_post_ops_scalar(float &dst_value, int ind
depthwise_inj_idx++;
} else if (post_op.is_quantization()) {
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || output_prec == Precision::FP32 || i != p.len_ - 1;
bool do_rounding = do_dequantization || output_prec == Precision::FP32 || i != p.len() - 1;
auto quant = post_op.quantization;

View File

@ -4,8 +4,12 @@
#pragma once
#include "ref_eltwise.hpp"
#include "ref_depthwise.hpp"
#include <mkldnn_node.h>
#include <mkldnn.hpp>
#include <cassert>
#include <cpu/ref_eltwise.hpp>
#include <cpu/ref_depthwise_injector.hpp>
using namespace InferenceEngine;
@ -47,6 +51,8 @@ struct jit_uni_normalize_modulo_kernel {
jit_uni_normalize_modulo_kernel(jit_normalize_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_normalize_modulo_kernel() {}
virtual void create_ker() = 0;
jit_normalize_config_params jcp_;
};
@ -61,6 +67,8 @@ struct jit_uni_normalize_kernel {
explicit jit_uni_normalize_kernel(jit_normalize_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
virtual ~jit_uni_normalize_kernel() {}
virtual void create_ker() = 0;
jit_normalize_config_params jcp_;
const mkldnn_primitive_attr &attr_;
};
@ -118,6 +126,8 @@ private:
std::vector<std::shared_ptr<mkldnn::impl::cpu::ref_eltwise_scalar_fwd_t>> eltwise_injectors_ref;
std::vector<std::shared_ptr<mkldnn::impl::cpu::ref_depthwise_scalar_fwd_t>> depthwise_injectors_ref;
jit_normalize_config_params jcp = {};
};
} // namespace MKLDNNPlugin

View File

@ -87,16 +87,16 @@ void MKLDNNPadNode::initSupportedPrimitiveDescriptors() {
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
auto pushSupportedPrimitiveDescriptor = [&](memory::format memoryFormat) {
auto pushSupportedPrimitiveDescriptor = [&](memory::format_tag memoryFormat) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), dataType, memoryFormat);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), dataType, memoryFormat);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::ref, memoryFormat});
};
if (numOfDims == 4)
pushSupportedPrimitiveDescriptor(mkldnn::memory::nhwc);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nhwc);
else if (numOfDims == 5)
pushSupportedPrimitiveDescriptor(mkldnn::memory::ndhwc);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::ndhwc);
pushSupportedPrimitiveDescriptor(MKLDNNMemory::GetPlainFormat(getParentEdgeAt(0)->getDims()));
@ -107,14 +107,14 @@ void MKLDNNPadNode::initSupportedPrimitiveDescriptors() {
if (numOfDims == 4) {
if (srcDims[1] % 8 == 0 && canUseBlocked(8))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nChw8c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nChw8c);
if (srcDims[1] % 16 == 0 && canUseBlocked(16))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nChw16c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nChw16c);
} else if (numOfDims == 5) {
if (srcDims[1] % 8 == 0 && canUseBlocked(8))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nCdhw8c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nCdhw8c);
if (srcDims[1] % 16 == 0 && canUseBlocked(16))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nCdhw16c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nCdhw16c);
}
}
@ -136,8 +136,7 @@ void MKLDNNPadNode::createPrimitive() {
params.srcStrides = getParentEdgeAt(0)->getBlob()->getTensorDesc().getBlockingDesc().getStrides();
params.dstStrides = getChildEdgeAt(0)->getBlob()->getTensorDesc().getBlockingDesc().getStrides();
auto layout = this->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].desc.getLayout();
if (layout == BLOCKED) {
if (getParentEdgeAt(0)->getMemory().GetDesc().isBlockedCFormat()) {
padsBegin[1] /= params.srcDims[params.srcDims.size() - 1];
padsEnd[1] /= params.srcDims[params.srcDims.size() - 1];
padsBegin.push_back(0);
@ -259,8 +258,8 @@ void MKLDNNPadNode::padConstant() {
template<typename T>
void MKLDNNPadNode::padConstantCommon() {
T* srcData = reinterpret_cast<T*>(getDataPtr(this->getParentEdgeAt(0)->getMemory()));
T* dstData = reinterpret_cast<T*>(getDataPtr(this->getChildEdgeAt(0)->getMemory()));
T* srcData = reinterpret_cast<T*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
T* dstData = reinterpret_cast<T*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
T value = static_cast<T>(padValue);
parallel_nt(0, [&](const int ithr, const int nthr) {
@ -301,8 +300,8 @@ void MKLDNNPadNode::padConstantCommon() {
}
void MKLDNNPadNode::padConstantZero() {
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* dstData = getDataPtr(this->getChildEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
@ -342,8 +341,8 @@ void MKLDNNPadNode::padConstantZero() {
}
void MKLDNNPadNode::padEdge() {
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* dstData = getDataPtr(this->getChildEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
@ -379,8 +378,8 @@ void MKLDNNPadNode::padEdge() {
}
void MKLDNNPadNode::padReflectOrSymmetric(const bool isSymmetric) {
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* dstData = getDataPtr(this->getChildEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
size_t shift = isSymmetric ? 1 : 0;
@ -425,11 +424,6 @@ inline void MKLDNNPadNode::getDstIdx(const InferenceEngine::SizeVector& indexes,
dstIdx *= (padMode == CONSTANT && padValue != 0) ? 1 : params.sizeData;
}
inline uint8_t* MKLDNNPadNode::getDataPtr(const MKLDNNMemory& memoryPtr) const {
return reinterpret_cast<uint8_t*>(memoryPtr.GetData()) + memoryPtr.GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(memoryPtr.GetDescriptor().data.data_type));
}
bool MKLDNNPadNode::created() const {
return getType() == Pad;
}

View File

@ -36,7 +36,6 @@ private:
void padReflectOrSymmetric(const bool isSymmetric = false);
inline void getDstIdx(const InferenceEngine::SizeVector& indexes, size_t& dstIdx) const;
inline uint8_t* getDataPtr(const MKLDNNMemory& memoryPtr) const;
PadMode padMode = CONSTANT;
float padValue = 0.f;

View File

@ -8,23 +8,31 @@
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include "ie_parallel.hpp"
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <algorithm>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
#define GET_OFF(field) offsetof(jit_args_permute, field)
template <cpu::cpu_isa_t isa>
template <cpu::x64::cpu_isa_t isa>
struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_permute_kernel_f32)
explicit jit_uni_permute_kernel_f32(jit_permute_conf_t jpp) : jit_uni_permute_kernel(jpp), jit_generator() {
explicit jit_uni_permute_kernel_f32(jit_permute_conf_t jpp) : jit_uni_permute_kernel(jpp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
@ -33,8 +41,6 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge
loop(jpp.n);
this->postamble();
ker_ = (decltype(ker_))this->getCode();
}
void load(const Xbyak::Xmm &xmm, const Xbyak::Address &addr) {
@ -115,7 +121,7 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
uint32_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Reg64 reg_src = r8;
@ -174,52 +180,52 @@ void MKLDNNPermuteNode::initSupportedPrimitiveDescriptors() {
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nchw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nchw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nchw});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nchw});
auto srcDims = getParentEdgeAt(0)->getDims();
if (srcDims[1] % 8 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nChw8c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nChw8c});
}
if (srcDims[1] % 16 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nChw16c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nChw16c});
}
if (prec == Precision::I8 || prec == Precision::U8) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nhwc});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nhwc});
}
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ncdhw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ncdhw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::ncdhw});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::ncdhw});
auto srcDims = getParentEdgeAt(0)->getDims();
if (srcDims[1] % 8 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nCdhw8c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nCdhw8c});
}
if (srcDims[1] % 16 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nCdhw16c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nCdhw16c});
}
if (prec == Precision::I8 || prec == Precision::U8) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ndhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ndhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::ndhwc});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ndhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ndhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::ndhwc});
}
} else {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::any);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()));
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims())});
// general plain case
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
}
}
@ -332,7 +338,7 @@ void MKLDNNPermuteNode::createPrimitive() {
}
}
int max_threads = mkldnn_get_max_threads();
int max_threads = dnnl_get_max_threads();
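// The mkldnn_* C API entry points were renamed to dnnl_* along with the library;
// only the prefix changes, the semantics are the same.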
const int n_max = 3; // max count dims for parallel
int n = 0;
int work_amount = sorted_dst_dims[0];
@ -351,24 +357,27 @@ void MKLDNNPermuteNode::createPrimitive() {
jpp.ndims = sorted_order.size();
jpp.data_size = MKLDNNExtensionUtils::sizeOfDataType(data_type);
if (mayiuse(cpu::avx512_common)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::avx512_common>(jpp));
} else if (mayiuse(cpu::avx2)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::avx2>(jpp));
} else if (mayiuse(cpu::sse42)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::sse42>(jpp));
if (mayiuse(cpu::x64::avx512_common)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::avx512_common>(jpp));
} else if (mayiuse(cpu::x64::avx2)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::avx2>(jpp));
} else if (mayiuse(cpu::x64::sse41)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::sse41>(jpp));
}
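// Two ISA-related renames in oneDNN v1.x show up here: x86 JIT code moved into the
// cpu::x64 namespace, and sse42 became sse41 (the implementations require only SSE4.1).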
if (permute_kernel)
permute_kernel->create_ker();
}
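// Note for readers following the migration: oneDNN v1.x JIT kernels are built in two
// phases. The constructor only stores parameters; create_ker() triggers code emission.
// A minimal sketch of the pattern used by the kernels in this patch (type and member
// names are illustrative, see the reduce kernels below for the real implementations):
//
//     struct jit_uni_example_kernel_f32 : jit_uni_example_kernel, public jit_generator {
//         explicit jit_uni_example_kernel_f32(jit_example_params jep)
//             : jit_uni_example_kernel(jep), jit_generator() {}  // no code emitted yet
//         void create_ker() override {
//             jit_generator::create_kernel();           // runs generate() and finalizes the code
//             ker_ = (decltype(ker_))jit_ker();         // fetch the generated entry point
//         }
//         void generate() override { /* emit instructions, prepare constant tables */ }
//     };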
static void permute_to_0231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
// Supports only NCHW to NHWC
int block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat())) {
block_size = srcMemPtr->GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!srcMemPtr->GetDesc().isPlainFormat()) {
const auto &blk_desc = srcMemPtr->GetDescriptor().data.format_desc.blocking;
auto found = std::find(blk_desc.inner_idxs, blk_desc.inner_idxs + blk_desc.inner_nblks, 1);
auto pos = std::distance(blk_desc.inner_idxs, found);  // index of the channel blocking level
block_size = blk_desc.inner_blks[pos];
}
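// For reference: in the oneDNN v1.x memory descriptor, format_desc.blocking describes
// blocked layouts via inner_nblks blocking levels, where inner_idxs[i] is the logical
// dimension being blocked and inner_blks[i] its block size. E.g. for nChw8c:
// inner_nblks == 1, inner_idxs == {1}, inner_blks == {8}, so block_size becomes 8 here.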
const int C = srcMemPtr->GetDims()[1];
@ -394,13 +403,14 @@ static void permute_to_0231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_0213(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
int block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat())) {
block_size = srcMemPtr->GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!srcMemPtr->GetDesc().isPlainFormat()) {
const auto &blk_desc = srcMemPtr->GetDescriptor().data.format_desc.blocking;
auto found = std::find(blk_desc.inner_idxs, blk_desc.inner_idxs + blk_desc.inner_nblks, 1);
auto pos = std::distance(blk_desc.inner_idxs, found);  // index of the channel blocking level
block_size = blk_desc.inner_blks[pos];
}
const int C = srcMemPtr->GetDims()[1];
@ -419,10 +429,8 @@ static void permute_to_0213(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_0312(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int H = srcMemPtr->GetDims()[2];
@ -439,10 +447,8 @@ static void permute_to_0312(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
template <size_t scale_H = 0, size_t scale_W = 0>
static void permute_to_014253(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int CH = scale_H > 0 ? static_cast<int>(scale_H) : srcMemPtr->GetDims()[2];
@ -477,10 +483,8 @@ static void permute_to_014253(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPt
}
static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int H = srcMemPtr->GetDims()[2];
@ -507,10 +511,8 @@ static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int S = srcMemPtr->GetDims()[2];
@ -533,10 +535,8 @@ static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -571,13 +571,14 @@ static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPt
}
static void permute_to_0132(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
int src_block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat())) {
src_block_size = srcMemPtr->GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!srcMemPtr->GetDesc().isPlainFormat()) {
const auto &blk_desc = srcMemPtr->GetDescriptor().data.format_desc.blocking;
auto found = std::find(blk_desc.inner_idxs, blk_desc.inner_idxs + blk_desc.inner_nblks, 1);
auto pos = std::distance(blk_desc.inner_idxs, found);  // index of the channel blocking level
src_block_size = blk_desc.inner_blks[pos];
}
const int C = srcMemPtr->GetDims()[1];
@ -596,10 +597,8 @@ static void permute_to_0132(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_03142(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -630,10 +629,8 @@ static void permute_to_03142(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_1203(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int H = srcMemPtr->GetDims()[2];
@ -649,10 +646,8 @@ static void permute_to_1203(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_02134(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -678,10 +673,8 @@ static void permute_to_02134(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_02431(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -707,10 +700,8 @@ static void permute_to_02431(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_04231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -736,10 +727,8 @@ static void permute_to_04231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_102(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int S = srcMemPtr->GetDims()[2];
@ -762,10 +751,8 @@ static void permute_to_102(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_02341(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -791,10 +778,8 @@ static void permute_to_02341(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_04123(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -824,52 +809,52 @@ const std::multimap<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl>
return true;
})}, // NCHW -> NHWC case
{{0, 1, 4, 2, 5, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_014253<2, 2>, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && srcMemPtr->GetDims()[2] == 2 && srcMemPtr->GetDims()[3] == 2;
return srcMemPtr->GetDesc().isPlainFormat() && srcMemPtr->GetDims()[2] == 2 && srcMemPtr->GetDims()[3] == 2;
})}, // Dense upsample convolution case (scale = 2)
{{0, 1, 4, 2, 5, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_014253<0, 0>, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // Dense upsample convolution case (generic)
{{3, 0, 1, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_3012, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && MB == srcMemPtr->GetDims()[0];
return srcMemPtr->GetDesc().isPlainFormat() && MB == srcMemPtr->GetDims()[0];
})}, // LPR case
{{0, 2, 1, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_0213, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // shufflenet
{{0, 2, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_021, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // self attention block
{{0, 3, 4, 1, 5, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_034152, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // learning-to-see-in-the-dark-sony
{{0, 1, 3, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_0132, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return true;
})},
{{0, 3, 1, 4, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_03142, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{1, 2, 0, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_1203, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && MB == srcMemPtr->GetDims()[0];
return srcMemPtr->GetDesc().isPlainFormat() && MB == srcMemPtr->GetDims()[0];
})},
{{0, 2, 1, 3, 4}, MKLDNNPermuteNode::PermuteImpl(permute_to_02134, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 2, 4, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_02431, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 4, 2, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_04231, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 3, 1, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_0312, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{1, 0, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_102, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && MB == srcMemPtr->GetDims()[0];
return srcMemPtr->GetDesc().isPlainFormat() && MB == srcMemPtr->GetDims()[0];
})},
{{0, 2, 3, 4, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_02341, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 4, 1, 2, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_04123, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
};
@ -887,14 +872,11 @@ void MKLDNNPermuteNode::execute(mkldnn::stream strm) {
}
if (permute_kernel) {
auto src_data = reinterpret_cast<const char *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<char *>(dstMemPtr->GetData());
auto src_data = reinterpret_cast<const char *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<char *>(dstMemPtr->GetPtr());
const auto &jpp = (*permute_kernel).jpp;
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * jpp.data_size;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * jpp.data_size;
SizeVector dst_dims = jpp.dst_block_dims;
SizeVector dst_strides = jpp.dst_strides;
SizeVector src_strides = jpp.src_strides;

View File

@ -37,6 +37,8 @@ struct jit_uni_permute_kernel {
jit_permute_conf_t jpp;
virtual void create_ker() = 0;
explicit jit_uni_permute_kernel(jit_permute_conf_t jpp) : ker_(nullptr), jpp(jpp) {}
virtual ~jit_uni_permute_kernel() {}
};

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_pooling_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include "mkldnn_conv_node.h"
#include "mkldnn_concat_node.h"
@ -14,6 +14,7 @@
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include <utils/general_utils.h>
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -23,6 +24,22 @@ MKLDNNPoolingNode::MKLDNNPoolingNode(const InferenceEngine::CNNLayerPtr& layer,
MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache) {}
std::vector<memory::format_tag> MKLDNNPoolingNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
if (dims.ndims() == 0)
return {memory::format_tag::x};
else if (dims.ndims() == 1)
return {memory::format_tag::x};
else if (dims.ndims() == 2)
return {memory::format_tag::nc};
else if (dims.ndims() == 3)
return {memory::format_tag::tnc, memory::format_tag::ntc};
else if (dims.ndims() == 4)
return {memory::format_tag::nChw8c, memory::format_tag::nChw16c, memory::format_tag::nhwc, memory::format_tag::nchw};
else if (dims.ndims() == 5)
return {memory::format_tag::nCdhw8c, memory::format_tag::nCdhw16c, memory::format_tag::ndhwc, memory::format_tag::ncdhw};
return {memory::format_tag::any};
}
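// The blocked variants match the vector widths of the JIT kernels: 8-channel blocks
// (nChw8c/nCdhw8c) for the SSE4.1/AVX2 paths and 16-channel blocks for AVX-512.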
void MKLDNNPoolingNode::getSupportedDescriptors() {
if (!descs.empty())
return;
@ -88,18 +105,18 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
}
if (inputPrecision == Precision::I8 || inputPrecision == Precision::U8) {
// i8 layers support only the ndhwc and nhwc layouts
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc};
createDescriptor({ in_candidate }, { out_candidate });
} else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && parentDims[1] == 1) {
// Workaround: force the planar layout since it provides better performance when C == 1
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format_tag::ncdhw : memory::format_tag::nchw};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format_tag::ncdhw : memory::format_tag::nchw};
createDescriptor({ in_candidate }, { out_candidate });
} else {
if (inputDataType != memory::bf16) {
inputDataType = memory::f32;
outputDataType = memory::f32;
if (inputDataType != memory::data_type::bf16) {
inputDataType = memory::data_type::f32;
outputDataType = memory::data_type::f32;
}
// The 'any' format tag is not supported here, so create a descriptor for each available format
for (auto format : getAvailableFormatsForDims(parentDims)) {
@ -119,8 +136,11 @@ void MKLDNNPoolingNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<pooling_forward::primitive_desc, pooling_forward::desc>(attr);
prim.reset(new pooling_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new pooling_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
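// oneDNN v1.x primitives are stateless with respect to memory: instead of binding the
// memory objects at construction time, they are supplied at execution time through an
// argument map keyed by DNNL_ARG_* (consumed later as prim->execute(strm, primArgs)).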
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
bool MKLDNNPoolingNode::created() const {
@ -149,23 +169,28 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens
}
}
if (!exclude_pad && (not_zero_l || not_zero_r))
alg = pooling_avg_include_padding;
alg = algorithm::pooling_avg_include_padding;
else
alg = pooling_avg_exclude_padding;
alg = algorithm::pooling_avg_exclude_padding;
} else if (type == PoolingLayer::PoolType::MAX) {
alg = pooling_max;
alg = algorithm::pooling_max;
} else {
// TODO: Handle the rest of the possible types: STOCH, ROI, SPACIAL_PYRAMID
THROW_IE_EXCEPTION << "Unsupported pooling type";
}
auto convert = [] (std::vector<ptrdiff_t> orig_dims) {
return memory::dims(orig_dims.begin(), orig_dims.end());
};
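// memory::dims is std::vector<dnnl_dim_t> (int64_t) in oneDNN v1.x, so the IE-side
// ptrdiff_t vectors for stride/kernel/padding are converted element-wise before use.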
std::shared_ptr<pooling_forward::desc> desc_ptr(
new pooling_forward::desc(prop_kind::forward_scoring, alg,
in_candidate, out_candidate,
stride, kernel, effective_pad_begin, effective_pad_end,
mkldnn::padding_kind::zero));
convert(stride),
convert(kernel),
convert(effective_pad_begin),
convert(effective_pad_end)));
if (alg == pooling_avg_include_padding) {
if (alg == algorithm::pooling_avg_include_padding) {
// In case of AVG pooling that includes padding, the norm coefficient should be
// calculated taking the original pads into account, so we need to restore the
// original values for the end paddings.
@ -190,7 +215,7 @@ void MKLDNNPoolingNode::initSupportedPrimitiveDescriptors() {
for (auto& desc : descs) {
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
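// The oneDNN v1.x iterator protocol: the iterator converts to bool while an
// implementation is available and advances via next_impl(), replacing the old
// is_not_end()/operator++ pair (see the matching change at the bottom of the loop).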
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
@ -201,30 +226,18 @@ void MKLDNNPoolingNode::initSupportedPrimitiveDescriptors() {
config.inConfs.push_back(dataConfig);
}
std::vector<mkldnn::memory::format> outFormats;
for (size_t i = 0; i < descOutputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getDstMemDesc(itpd, i));
config.outConfs.push_back(dataConfig);
auto primDesc = itpd.fetch();
auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0);
if (dstPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
} else {
// This path is needed to correctly handle Deconvolution node
auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0);
if (diffSrcPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.diff_src_primitive_desc().desc().data.format));
}
}
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
@ -249,18 +262,18 @@ void MKLDNNPoolingNode::initDescriptor(const InferenceEngine::LayerConfig &confi
size_t selected_count = 0;
for (size_t j = 0; j < descs.size(); j++) {
const auto &desc = descs[j];
std::shared_ptr<primitive_desc_iterator> itpd;
primitive_desc_iterator itpd;
itpd = std::make_shared<primitive_desc_iterator>(desc.createPrimitiveDescriptorIterator(getEngine(), attr));
itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd->is_not_end()) {
while (itpd) {
InferenceEngine::LayerConfig cfg;
cfg.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = getSrcMemDesc(*itpd, i);
dataConfig.desc = getSrcMemDesc(itpd, i);
cfg.inConfs.push_back(dataConfig);
}
@ -268,10 +281,10 @@ void MKLDNNPoolingNode::initDescriptor(const InferenceEngine::LayerConfig &confi
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(*itpd, i);
dataConfig.desc = getDstMemDesc(itpd, i);
cfg.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd->get_impl_info_str().c_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (selected_count == selectedPrimitiveDescriptorIndex) {
if (impl_type != selectedPD->getImplementationType()) {
THROW_IE_EXCEPTION << "Cannot get the original layer configuration!";
@ -284,7 +297,8 @@ void MKLDNNPoolingNode::initDescriptor(const InferenceEngine::LayerConfig &confi
}
}
selected_count++;
(*itpd)++;
if (!itpd.next_impl())
break;
}
}

View File

@ -19,6 +19,7 @@ public:
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
std::vector<mkldnn::memory::format_tag> getAvailableFormatsForDims(const MKLDNNDims &dims) const override;
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void initDescriptor(const InferenceEngine::LayerConfig &config) override;

View File

@ -6,14 +6,68 @@
#include <ie_common.h>
#include <mkldnn_node.h>
#include <common/primitive_attr.hpp>
#include <string>
#include <memory>
#include <vector>
#include <utility>
#include <primitive_attr.hpp>
namespace MKLDNNPlugin {
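// A rough guide to the modes as used by this node: FakeQuantization emulates the
// quantize/dequantize pair while keeping a float output, Quantization produces a real
// integer output, and Binarization packs the results of threshold comparisons into bits.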
enum QuantizeOpType {
FakeQuantization,
Quantization,
Binarization,
};
struct jit_quantize_params {
int c;
InferenceEngine::Precision src_prc;
InferenceEngine::Precision wei_prc;
InferenceEngine::Precision dst_prc;
InferenceEngine::Layout src_layout;
QuantizeOpType op_type;
};
struct jit_quantize_call_args {
const uint8_t* from;
const uint8_t* to;
const float* thresholds;
const float* output_mask;
const float* crop_low;
const float* crop_high;
const float* input_scale;
const float* input_shift;
const float* output_scale;
const float* output_shift;
size_t src_step;
size_t dst_step;
size_t block_size;
size_t work_amount;
};
struct jit_uni_quantize_kernel {
void (*ker_)(const jit_quantize_call_args *);
void operator()(const jit_quantize_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_quantize_kernel(jit_quantize_params jqp) : ker_(nullptr), jqp_(jqp) {}
virtual ~jit_uni_quantize_kernel() {}
virtual void create_ker() = 0;
jit_quantize_params jqp_;
};
class MKLDNNQuantizeNode : public MKLDNNNode {
public:
MKLDNNQuantizeNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
@ -27,8 +81,8 @@ public:
size_t getAxis() const { return axis; }
bool isBinarization() const { return quantizeAlgorithm == mkldnn::algorithm::binarization_depthwise; }
mkldnn::algorithm getAlgorithm() const { return quantizeAlgorithm; }
bool isBinarization() const { return quantizeOpType == QuantizeOpType::Binarization; }
QuantizeOpType getOpType() const { return quantizeOpType; }
const float* getBinarizationTresholdsPtr() const { return &binarizationThresholds[0]; }
const float* getBinarizationOutputMaskPtr() const { return reinterpret_cast<const float*>(&binarizationOutputMask[0]); }
@ -61,7 +115,10 @@ public:
private:
void init() override;
std::vector<mkldnn::memory::format> getDataFormats() const;
std::vector<mkldnn::memory::format_tag> getDataFormats() const;
void executeReference();
void executeBinarization();
void executeQuantization();
int levels = -1;
@ -94,7 +151,11 @@ private:
InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
mkldnn::algorithm quantizeAlgorithm = mkldnn::algorithm::algorithm_undef;
QuantizeOpType quantizeOpType = FakeQuantization;
jit_quantize_params jqp = {};
std::shared_ptr<jit_uni_quantize_kernel> quantize_kernel = nullptr;
};
} // namespace MKLDNNPlugin

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_reduce_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
@ -16,16 +16,17 @@
#include "ie_parallel.hpp"
#include <algorithm>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
@ -67,7 +68,7 @@ using namespace Xbyak;
// some utility functions
static inline bool isFloatCompatible(memory::data_type type) {
return memory::f32 == type || memory::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
}
template <cpu_isa_t isa>
@ -75,8 +76,15 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32)
explicit jit_uni_reduce_kernel_f32(jit_reduce_config_params jcp)
: jit_uni_reduce_kernel(jcp), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
: jit_uni_reduce_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f, 1));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
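// On AVX-512 targets without native bf16 support (no VCVTNEPS2BF16 instruction),
// the fp32 -> bf16 down-conversion is emulated in software by jit_emu_vcvtneps2bf16.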
@ -94,10 +102,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
mov(reg_table, l_table);
}
if (isa == cpu::avx512_common || jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::Or)
if (isa == cpu::x64::avx512_common || jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::Or)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
if ((isa == cpu::avx512_common && jcp_.reduce_mode == Reduce::And) || jcp_.reduce_mode == Reduce::Or) {
if ((isa == cpu::x64::avx512_common && jcp_.reduce_mode == Reduce::And) || jcp_.reduce_mode == Reduce::Or) {
uni_vmovups(vmm_aux, table_val(0));
}
@ -115,12 +123,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
} else if (jcp_.reduce_mode == Reduce::LogSumExp) {
exp_injector->prepare_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
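// The vector register type follows the ISA: Xmm (128-bit) for SSE4.1, Ymm (256-bit)
// for AVX2, and Zmm (512-bit) otherwise; vlen below is the matching width in bytes.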
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -250,7 +256,7 @@ private:
load_vector(vmm_src, ptr[reg_src], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst_aux);
}
@ -424,7 +430,7 @@ private:
load_vector(vmm_src, ptr[reg_src], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst);
}
@ -530,7 +536,7 @@ private:
inline void load_dst_vector() {
load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
}
@ -538,30 +544,30 @@ private:
if (jcp_.reduce_mode == Reduce::Or && isa != avx512_common) {
vcmpneqps(vmm_dst, vmm_dst, vmm_zero);
uni_vandps(vmm_dst, vmm_dst, vmm_aux);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
vcmpneqps(vmm_dst_aux, vmm_dst_aux, vmm_zero);
uni_vandps(vmm_dst_aux, vmm_dst_aux, vmm_aux);
}
}
store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst_aux, jcp_.dst_dt);
}
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -574,19 +580,19 @@ private:
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
@ -608,41 +614,41 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(op, vmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::s8:
case memory::data_type::s8:
if (isa == avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
break;
case memory::u8:
case memory::data_type::u8:
if (isa == avx512_common) {
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -659,21 +665,21 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
@ -685,9 +691,9 @@ private:
}
inline void load_embedded_horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_embedded_horiz_store(vmm_dst, dst_dt);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_dst, 0);
vextractf128(xmm_aux2, ymm_dst, 1);
@ -712,20 +718,20 @@ private:
movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
switch (dst_dt) {
case memory::f32:
case memory::bf16:
case memory::data_type::f32:
case memory::data_type::bf16:
load_scalar(xmm_aux3, ptr[reg_dst], dst_dt);
horiz_ps(xmm_dst, xmm_aux3);
store_scalar(ptr[reg_dst], xmm_dst, dst_dt);
break;
case memory::s32:
case memory::data_type::s32:
movss(xmm_aux3, ptr[reg_dst]);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
horiz_ps(xmm_dst, xmm_aux3);
uni_vcvtps2dq(xmm_dst, xmm_dst);
movss(ptr[reg_dst], xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovzxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -735,7 +741,7 @@ private:
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
pextrb(ptr[reg_dst], xmm_dst, 0);
break;
case memory::s8:
case memory::data_type::s8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovsxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -814,8 +820,15 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_post_kernel_f32)
explicit jit_uni_reduce_post_kernel_f32(jit_reduce_config_params jcp)
: jit_uni_reduce_post_kernel(jcp), jit_generator() {
log_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_log, 0.f, 0.f));
: jit_uni_reduce_post_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
log_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_log, 0.f, 0.f, 1.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
@ -828,7 +841,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
if (!jcp_.planar_layout)
mov(reg_reduce_c, ptr[reg_params + GET_OFF(reduce_c)]);
if (isa == cpu::avx512_common)
if (isa == cpu::x64::avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
reduce_post_main();
@ -843,12 +856,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) {
log_injector->prepare_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -902,12 +913,12 @@ private:
// load
load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
// reduce and store
horiz_reduce_store(vmm_dst, jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_embedded_horiz_reduce_store(vmm_dst_aux, jcp_.dst_dt);
add(reg_dst, step * jcp_.dst_data_size);
@ -941,17 +952,17 @@ private:
// load
load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
// reduce
reduce_map_kernel(vmm_dst);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
reduce_map_kernel(vmm_dst_aux);
// store
store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst_aux, jcp_.dst_dt);
add(reg_dst, step * jcp_.dst_data_size);
@ -1019,18 +1030,18 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -1043,19 +1054,19 @@ private:
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
@ -1077,41 +1088,41 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(op, vmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::s8:
case memory::data_type::s8:
if (isa == avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
break;
case memory::u8:
case memory::data_type::u8:
if (isa == avx512_common) {
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -1128,21 +1139,21 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
@ -1154,9 +1165,9 @@ private:
}
inline void horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
horize_store(vmm_dst, dst_dt);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_dst, 0);
vextractf128(xmm_aux2, ymm_dst, 1);
@ -1181,24 +1192,24 @@ private:
movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
switch (dst_dt) {
case memory::f32:
case memory::data_type::f32:
movss(ptr[reg_dst], xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(ptr[reg_dst], xmm_dst, 0x0);
break;
case memory::s32:
case memory::data_type::s32:
uni_vcvtps2dq(xmm_dst, xmm_dst);
movss(ptr[reg_dst], xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
uni_vcvtps2dq(xmm_dst, xmm_dst);
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
pextrb(ptr[reg_dst], xmm_dst, 0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vcvtps2dq(xmm_dst, xmm_dst);
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
@ -1210,9 +1221,9 @@ private:
}
inline void load_embedded_horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_embedded_horiz_store(vmm_dst, dst_dt);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_dst, 0);
vextractf128(xmm_aux2, ymm_dst, 1);
@ -1237,20 +1248,20 @@ private:
movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
switch (dst_dt) {
case memory::f32:
case memory::bf16:
case memory::data_type::f32:
case memory::data_type::bf16:
load_scalar(xmm_aux3, ptr[reg_dst], dst_dt);
horiz_ps(xmm_dst, xmm_aux3);
store_scalar(ptr[reg_dst], xmm_dst, dst_dt);
break;
case memory::s32:
case memory::data_type::s32:
movss(xmm_aux3, ptr[reg_dst]);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
horiz_ps(xmm_dst, xmm_aux3);
uni_vcvtps2dq(xmm_dst, xmm_dst);
movss(ptr[reg_dst], xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovzxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -1260,7 +1271,7 @@ private:
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
pextrb(ptr[reg_dst], xmm_dst, 0);
break;
case memory::s8:
case memory::data_type::s8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovsxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -1369,7 +1380,7 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
Precision inputPrecision = getCnnLayer()->insData[REDUCE_DATA].lock()->getPrecision();
Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
jit_mode = (mayiuse(cpu::sse42)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 &&
jit_mode = (mayiuse(cpu::x64::sse41)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 &&
std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), inputPrecision) != std::end(supportedPrecisions) &&
std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), outputPrecision) != std::end(supportedPrecisions);
@ -1405,19 +1416,19 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
config.inConfs[REDUCE_INDEXES].inPlace = -1;
config.outConfs[0].inPlace = -1;
auto pushDesc = [&](memory::format inFormat, memory::format outFormat, memory::data_type inDataType,
auto pushDesc = [&](memory::format_tag inFormat, memory::format_tag outFormat, memory::data_type inDataType,
memory::data_type outDataType, impl_desc_type impl_type) {
config.inConfs[REDUCE_DATA].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_DATA)->getDims(), inDataType, inFormat);
config.inConfs[REDUCE_INDEXES].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_INDEXES)->getDims(), memory::s32, memory::x);
config.inConfs[REDUCE_INDEXES].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_INDEXES)->getDims(), memory::data_type::s32, memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outDataType, outFormat);
supportedPrimitiveDescriptors.push_back({config, impl_type, outFormat});
};
if (jit_mode) {
impl_desc_type impl_type = impl_desc_type::jit_sse42;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
}
@ -1425,22 +1436,23 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), inputDataType, outputDataType, impl_type);
if (keep_dims) {
if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 4 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nChw16c, memory::nChw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
pushDesc(memory::nChw8c, memory::nChw8c, inputDataType, outputDataType, impl_type);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nChw16c, memory::format_tag::nChw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::x64::avx2) || mayiuse(cpu::x64::sse41)) {
pushDesc(memory::format_tag::nChw8c, memory::format_tag::nChw8c, inputDataType, outputDataType, impl_type);
}
} else if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 5 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nCdhw16c, memory::nCdhw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
pushDesc(memory::nCdhw8c, memory::nCdhw8c, inputDataType, outputDataType, impl_type);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nCdhw16c, memory::format_tag::nCdhw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::x64::avx2) || mayiuse(cpu::x64::sse41)) {
pushDesc(memory::format_tag::nCdhw8c, memory::format_tag::nCdhw8c, inputDataType, outputDataType, impl_type);
}
}
}
} else {
pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())),
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), memory::f32, memory::f32, impl_desc_type::ref);
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())),
memory::data_type::f32, memory::data_type::f32, impl_desc_type::ref);
}
}
@ -1456,8 +1468,7 @@ void MKLDNNReduceNode::createPrimitive() {
THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "didn't set preferable primitive descriptor.";
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[REDUCE_DATA].desc.getLayout();
planar_layout = MKLDNNMemory::GetPlainLayout(getParentEdgeAt(REDUCE_DATA)->getDims()) == selected_layout;
planar_layout = getParentEdgeAt(REDUCE_DATA)->getMemory().GetDesc().isPlainFormat();
auto jcp = jit_reduce_config_params();
jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[REDUCE_DATA].desc.getPrecision());
@ -1467,20 +1478,26 @@ void MKLDNNReduceNode::createPrimitive() {
jcp.planar_layout = planar_layout;
jcp.reduce_mode = reduceMode;
if (mayiuse(cpu::avx512_common)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::avx512_common>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::avx512_common>(jcp));
if (mayiuse(cpu::x64::avx512_common)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::avx512_common>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::x64::avx512_common>(jcp));
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::avx2>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::avx2>(jcp));
} else if (mayiuse(cpu::x64::avx2)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::avx2>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::x64::avx2>(jcp));
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::sse42>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::sse42>(jcp));
} else if (mayiuse(cpu::x64::sse41)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::sse41>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::x64::sse41>(jcp));
blk_size = 8;
}
if (reduce_kernel)
reduce_kernel->create_ker();
if (reduce_post_kernel)
reduce_post_kernel->create_ker();
jit_mode = jit_mode && reduce_kernel;
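// If no ISA matched (CPU below SSE4.1), no kernel is created and execution falls
// back to the reference path (see the jit_mode check in execute()).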
}
@ -1521,12 +1538,8 @@ void MKLDNNReduceNode::execute(mkldnn::stream strm) {
ReduceW = IW != OW && OW == 1;
}
const uint8_t *src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemPtr->GetDescriptor().data.data_type));
uint8_t *dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemPtr->GetDescriptor().data.data_type));
const uint8_t *src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetPtr());
uint8_t *dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetPtr());
if (jit_mode) {
reduce_type(src_data, dst_data, dst_size);
} else {

View File

@ -56,6 +56,8 @@ struct jit_uni_reduce_kernel {
explicit jit_uni_reduce_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_reduce_kernel() {}
virtual void create_ker() = 0;
jit_reduce_config_params jcp_;
};
@ -67,6 +69,8 @@ struct jit_uni_reduce_post_kernel {
ker_(args);
}
virtual void create_ker() = 0;
explicit jit_uni_reduce_post_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_reduce_post_kernel() {}

View File

@ -58,8 +58,8 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].desc = parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].desc;
config.outConfs[0].desc = child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].desc;
} else {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::any);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format::any);
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::any);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::any);
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::reorder, MKLDNNMemory::Convert(config.outConfs[0].desc.getLayout()));
@ -103,65 +103,53 @@ void MKLDNNReorderNode::createReorderPrimitive(const mkldnn::memory::desc &srcDe
mask = 1 << oc_dim_id;
attr.set_output_scales(mask, scales);
attr.set_int_output_round_mode(round_nearest);
}
auto createReorder = [&]() {
auto createReorder = [&]() -> bool {
// No autoblocking. Reorder can be applied as is
reorder::primitive_desc pd = reorder::primitive_desc(src_blocked->GetPrimitiveDescriptor(), dst_blocked->GetPrimitiveDescriptor(), attr);
reorder::primitive_desc pd = mkldnn::reorder::primitive_desc(src_blocked->GetPrimitive(), dst_blocked->GetPrimitive(), attr, true);
const char *info;
mkldnn_primitive_desc_query(pd.get(), mkldnn::convert_to_c(impl_info_str), 0, &info);
supportedPrimitiveDescriptors[0].setImplementationType(parse_impl_name(std::string(info)));
supportedPrimitiveDescriptors[0].setOutputLayouts(static_cast<memory::format>(dstDesc.data.format));
if (!pd)
return false;
prim.reset(new mkldnn::reorder(pd, src_blocked->GetPrimitive(), dst_blocked->GetPrimitive()));
auto info = pd.impl_info_str();
supportedPrimitiveDescriptors[0].setImplementationType(parse_impl_name(info));
prim.reset(new mkldnn::reorder(pd));
return true;
};
try {
createReorder();
} catch (...) {
auto success = createReorder();
if (!success) {
// TODO: We should keep shape consistency for const and expected shape for node.
//       If a reshape operation is required, it should be explicitly injected into the graph.
//
// There is a limitation in how IE represents weights for grouped convolutions: IE doesn't
// split the group dimension into a separate shape dimension. IE uses OIHW, but mkldnn expects
// GOIHW. So we perform an implicit reshape to the dst shape.
//
// MKLDNN doesn't support direct reorders from planar data formats to grouped weights formats.
// The code block below tries to detect such cases and reinterpret planar data formats (e.g. nchw)
// as grouped weights planar formats (e.g. goihw), since they share the same physical memory
// layout (see the sketch after this function).
if (MKLDNNMemory::GetPlainFormat(src_blocked->GetDims()) == src_blocked->GetFormat() &&
if (src_blocked->GetDesc().isPlainFormat() &&
src_blocked->GetDims().size() + 1 == dst_blocked->GetDims().size()) {
try {
mkldnn::memory::dims newDims = dst_blocked->GetDims();
mkldnn::memory::format newFormat;
if (MKLDNNMemory::IsGroupedFormat(dst_blocked->GetFormat())) {
newFormat = src_blocked->GetDims().size() == 4 ? memory::goihw :
src_blocked->GetDims().size() == 5 ? memory::goidhw :
src_blocked->GetFormat();
} else {
newFormat = src_blocked->GetDims().size() == 4 ? memory::ncdhw :
src_blocked->GetFormat();
}
const auto newDims = dst_blocked->GetDims();
const auto newFormat = MKLDNNMemory::GetPlainFormat(newDims);
auto newDesc = mkldnn::memory::desc(newDims, src_blocked->GetDataType(), newFormat);
src_blocked->Create(newDesc, srcPtr, false);
createReorder();
} catch (...) {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
}
// MKLDNN doesn't support direct reorders between planar data formats when they have different
// ranks but the same number of elements. The code block below detects these cases and
// substitutes the src dims with the dst ones.
} else if (MKLDNNMemory::GetPlainFormat(src_blocked->GetDims()) == src_blocked->GetFormat() &&
MKLDNNMemory::GetPlainFormat(dst_blocked->GetDims()) == dst_blocked->GetFormat() &&
src_blocked->GetElementsCount() == dst_blocked->GetElementsCount()) {
try {
auto newDesc = mkldnn::memory::desc(dst_blocked->GetDims(), src_blocked->GetDataType(), dst_blocked->GetFormat());
src_blocked->Create(newDesc, srcPtr, false);
createReorder();
} catch (...) {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
}
} else {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
success = createReorder();
}
}
if (!success) {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
}
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
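The grouped-weights fallback above relies on planar formats with equal element counts sharing one physical layout. A minimal sketch of that reinterpretation, with hypothetical dims for illustration only:

    // Hypothetical sketch: a planar 4D OIHW weights descriptor and a 5D grouped
    // GOIHW descriptor with G == 1 describe the same dense memory, so the same
    // data handle can back either view and the reorder can be retried with the
    // grouped descriptor.
    mkldnn::memory::desc planar({64, 3, 3, 3},
                                mkldnn::memory::data_type::f32,
                                mkldnn::memory::format_tag::oihw);
    mkldnn::memory::desc grouped({1, 64, 3, 3, 3},
                                 mkldnn::memory::data_type::f32,
                                 mkldnn::memory::format_tag::goihw);
    // planar.get_size() == grouped.get_size(): same bytes, new logical dims.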
const std::vector<impl_desc_type>& MKLDNNReorderNode::getPrimitivesPriority() {
@ -194,10 +182,10 @@ void MKLDNNReorderNode::setDynamicBatchLim(int lim) {
void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
src_d.data.dims[0] = batchToProcess();
src_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
src_d.data.padded_dims[0] = batchToProcess();
dst_d.data.dims[0] = batchToProcess();
dst_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
dst_d.data.padded_dims[0] = batchToProcess();
createReorderPrimitive(src_d, src_data_hdl, dst_d, dst_data_hdl);
}
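For reference, the memory-descriptor field renames applied here (and throughout this commit) when moving to oneDNN v1.x:

    // v0.x                                           v1.x
    // md.data.layout_desc.blocking.padding_dims   -> md.data.padded_dims
    // md.data.layout_desc.blocking.offset_padding -> md.data.offset0
    // MKLDNNMemory::GetPtr() now folds the offset0 adjustment into the returned
    // pointer, which is why the manual base-pointer arithmetic could be dropped.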

View File

@ -36,22 +36,19 @@ void MKLDNNReshapeNode::initSupportedPrimitiveDescriptors() {
if (inputDataType != outputDataType)
inputDataType = outputDataType;
auto& outDims = getChildEdgeAt(0)->getDims();
memory::format outFormat = MKLDNNMemory::GetPlainFormat(outDims);
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
config.inConfs.resize(getParentEdges().size());
for (size_t i = 0; i <getParentEdges().size(); i++) {
config.inConfs[i].inPlace = -1;
config.inConfs[i].constant = false;
config.inConfs[i].desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType,
MKLDNNMemory::GetPlainFormat(getParentEdgeAt(i)->getDims()));
config.inConfs[i].desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType);
}
config.outConfs.resize(1);
config.outConfs[0].inPlace = 0;
config.outConfs[0].constant = false;
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFormat);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
}
void MKLDNNReshapeNode::createPrimitive() {

View File

@ -4,7 +4,8 @@
#include "mkldnn_rnn.h"
#include "mkldnn_extension_utils.h"
#include "desc_iterator.hpp"
#include "utils/general_utils.h"
#include <string>
#include <utility>
@ -14,38 +15,55 @@ using namespace InferenceEngine;
namespace MKLDNNPlugin {
template <typename T, typename P>
inline bool one_of(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
inline bool one_of(T val, P item, Args... item_others) {
return val == item || one_of(val, item_others...);
}
using _RNN = RNNSequenceLayer; // alias
static rnn_direction ie2mkl(_RNN::Direction &direction) {
return direction == _RNN::FWD ? unidirectional_left2right
: direction == _RNN::BWD ? unidirectional_right2left
: direction == _RNN::BDR ? bidirectional_concat
: unidirectional;
return direction == _RNN::FWD ? rnn_direction::unidirectional_left2right
: direction == _RNN::BWD ? rnn_direction::unidirectional_right2left
: direction == _RNN::BDR ? rnn_direction::bidirectional_concat
: rnn_direction::unidirectional;
}
static algorithm ie2mkl(std::string act_type) {
return act_type == "sigmoid" ? eltwise_logistic
: act_type == "tanh" ? eltwise_tanh
: act_type == "relu" ? eltwise_relu
: algorithm_undef;
return act_type == "sigmoid" ? algorithm::eltwise_logistic
: act_type == "tanh" ? algorithm::eltwise_tanh
: act_type == "relu" ? algorithm::eltwise_relu
: algorithm::undef;
}
static algorithm ie2mkl(RNNCellBase::CellType cell_type) {
switch (cell_type) {
case RNNCellBase::LSTM: return vanilla_lstm;
case RNNCellBase::GRU: return vanilla_gru;
case RNNCellBase::GRU_LBR: return gru_linear_before_reset;
case RNNCellBase::RNN: return vanilla_rnn;
case RNNCellBase::RNN: return algorithm::vanilla_rnn;
case RNNCellBase::LSTM: return algorithm::vanilla_lstm;
case RNNCellBase::GRU: return algorithm::vanilla_gru;
case RNNCellBase::GRU_LBR: return algorithm::lbr_gru;
default:
THROW_IE_EXCEPTION << "Unsoupported cell type";
return algorithm_undef;
THROW_IE_EXCEPTION << "Unsupported cell type";
return algorithm::undef;
}
}
size_t gatesCount(algorithm alg) {
switch (alg) {
case algorithm::vanilla_rnn: return 1;
case algorithm::vanilla_gru:
case algorithm::lbr_gru: return 3;
case algorithm::vanilla_lstm: return 4;
default:
THROW_IE_EXCEPTION << "Unsupported cell type";
return 0;
}
}
size_t statesCount(algorithm alg) {
switch (alg) {
case algorithm::vanilla_rnn:
case algorithm::vanilla_gru:
case algorithm::lbr_gru: return 1;
case algorithm::vanilla_lstm: return 2;
default:
THROW_IE_EXCEPTION << "Unsupported cell type";
return 0;
}
}
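These helpers replace the dropped rnn_cell::desc::get_gates_count()/get_state_count() queries. Illustrative values, matching oneDNN's cell definitions:

    // gatesCount(algorithm::vanilla_lstm) == 4   (i, f, c~, o gates)
    // statesCount(algorithm::vanilla_lstm) == 2  (hidden h and cell c states)
    // gatesCount(algorithm::lbr_gru) == 3, and Gb = G + 1 below accounts for the
    // extra bias row that linear-before-reset GRU carries.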
@ -72,12 +90,14 @@ void MKLDNNRNN::fillCellDesc() {
if (!cellLayer)
THROW_IE_EXCEPTION << "No original layer for RNNCell.";
algorithm cell_type = ie2mkl(cellLayer->cellType);
algorithm cell_act = ie2mkl(cellLayer->activations[0]); // Works only for RNN with one gate
cell_type = ie2mkl(cellLayer->cellType);
cell_act = ie2mkl(cellLayer->activations[0]); // Works only for RNN with one gate
cell_desc = {cell_type, cell_act};
if (cellLayer->clip != 0.0f)
cell_desc.set_clipping(cellLayer->clip);
if (cellLayer->clip != 0.0f) {
// TODO [oneDNN]: No longer supported
THROW_IE_EXCEPTION << "Clipping is not supported for RNN primitive";
// cell_desc.set_clipping(cellLayer->clip);
}
auto &ins = cellLayer->insData;
auto &outs = cellLayer->outData;
@ -94,17 +114,17 @@ void MKLDNNRNN::fillCellDesc() {
if (in_data_dims.ndims() != 2 || in_h_state_dims.ndims() != 2)
THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
G = cell_desc.get_gates_count();
S = cell_desc.get_state_count();
G = gatesCount(cell_type);
S = statesCount(cell_type);
T = 1;
N = in_data_dims[0];
DC = in_data_dims[1];
SC = in_h_state_dims[1];
Gb = (cell_type != gru_linear_before_reset) ? G : G + 1;
Gb = (cell_type != mkldnn::algorithm::lbr_gru) ? G : G + 1;
// Expected shapes
MKLDNNDims D_shape {N, DC}, S_shape {N, SC};
MKLDNNDims D_shape {N, DC}, S_shape {N, SC}, S_4D_shape {L, D, N, SC};
if (in_data_dims != D_shape
|| in_h_state_dims != S_shape
@ -135,33 +155,34 @@ void MKLDNNRNN::fillCellDesc() {
THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
// Shapes and Attributes are correct. Can start internal stuff initialization.
in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
in_data_d = {{T, N, DC}, memory::f32, memory::tnc};
out_data_d = {{T, N, SC}, memory::f32, memory::tnc};
w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo};
if (bias)
w_bias_d = {{L, D, Gb, SC}, memory::f32, memory::ldgo};
std::vector<TensorDesc> in_candidate, out_candidate;
std::vector<memory::format> outputFormats;
in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::f32, memory::nc});
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
outputFormats.emplace_back(memory::nc);
if (S == 2) {
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
outputFormats.emplace_back(memory::nc);
for (size_t i = 0; i < S; i++) {
in_states_d.emplace_back(S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc);
out_states_d.emplace_back(S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc);
}
createDescriptor(in_candidate, out_candidate, outputFormats);
in_data_d = {{T, N, DC}, memory::data_type::f32, memory::format_tag::tnc};
out_data_d = {{T, N, SC}, memory::data_type::f32, memory::format_tag::tnc};
w_data_d = {{L, D, DC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
if (bias)
w_bias_d = {{L, D, Gb, SC}, memory::data_type::f32, memory::format_tag::ldgo};
std::vector<TensorDesc> in_candidate, out_candidate;
std::vector<memory::format_tag> outputFormats;
in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::data_type::f32, memory::format_tag::nc});
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
outputFormats.emplace_back(memory::format_tag::nc);
if (S == 2) {
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
outputFormats.emplace_back(memory::format_tag::nc);
}
createDescriptor(in_candidate, out_candidate);
}
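The state descriptors change shape here because oneDNN v1.x drops the fused 5D ldsnc state tensor: each state becomes its own 4D ldnc memory. Roughly:

    // v0.x: one fused state tensor        v1.x: one 4D tensor per state
    //   {L, D, S, N, SC} / ldsnc     ->     S x {L, D, N, SC} / ldnc
    // For LSTM (S == 2) this yields separate hidden and cell memories, passed
    // later via DNNL_ARG_SRC_ITER / DNNL_ARG_SRC_ITER_C.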
void MKLDNNRNN::fillSeqDesc() {
@ -174,15 +195,16 @@ void MKLDNNRNN::fillSeqDesc() {
if (!one_of(rnnLayer->cellType, _RNN::LSTM, _RNN::GRU, _RNN::GRU_LBR, _RNN::RNN))
THROW_IE_EXCEPTION << "RNN layer supports only LSTM/GRU/RNN cell";
algorithm cell_type = ie2mkl(rnnLayer->cellType);
algorithm cell_act = algorithm_undef;
cell_type = ie2mkl(rnnLayer->cellType);
cell_act = algorithm::undef;
if (!rnnLayer->activations.empty())
cell_act = ie2mkl(rnnLayer->activations[0]); // Works only for RNN with one gate
cell_desc = {cell_type, cell_act};
if (rnnLayer->clip != 0.0f)
cell_desc.set_clipping(rnnLayer->clip);
// TODO [oneDNN]: No longer supported
if (rnnLayer->clip != 0.0f) {
THROW_IE_EXCEPTION << "Clipping is not supported for RNN primitive";
// cell_desc.set_clipping(rnnLayer->clip);
}
if (!one_of(rnnLayer->axis, 0, 1))
THROW_IE_EXCEPTION << "RNN layer supports only sequence axis 0 or 1";
@ -211,34 +233,33 @@ void MKLDNNRNN::fillSeqDesc() {
std::swap(out_data_dims[0], out_data_dims[1]);
}
G = cell_desc.get_gates_count();
S = cell_desc.get_state_count();
G = gatesCount(cell_type);
S = statesCount(cell_type);
T = in_data_dims[0];
N = in_data_dims[1];
DC = in_data_dims[2];
SC = out_data_dims[2];
Gb = (cell_type != gru_linear_before_reset) ? G : G + 1;
Gb = (cell_type != mkldnn::algorithm::lbr_gru) ? G : G + 1;
MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC};
MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC}, S_4D_shape {L, D, N, SC};
if (out_data_dims != OD_shape)
THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
if (ins.size() > 1) {
for (int i = 1; i < ins.size(); i++)
if (getParentEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
in_states_d.resize(S);
out_states_d.resize(S);
in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
for (int i = 1; i < ins.size(); i++) {
if (getParentEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
in_states_d[i - 1] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
}
if (outs.size() > 1) {
for (int i = 1; i < outs.size(); i++)
if (getChildEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
for (int i = 1; i < outs.size(); i++) {
if (getChildEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
out_states_d[i - 1] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
}
auto blobs = rnnLayer->blobs;
@ -252,60 +273,98 @@ void MKLDNNRNN::fillSeqDesc() {
if (weights->size() != G*SC*(SC+DC))
THROW_IE_EXCEPTION << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC);
w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo};
w_data_d = {{L, D, DC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
if (bias && bias->size() != Gb*SC)
THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
if (bias)
w_bias_d = {{L, D, Gb, SC}, memory::f32, memory::ldgo};
w_bias_d = {{L, D, Gb, SC}, memory::data_type::f32, memory::format_tag::ldgo};
// Try to create descriptor and corresponding configuration
in_data_d = {in_data_dims, memory::f32, memory::tnc};
out_data_d = {out_data_dims, memory::f32, memory::tnc};
in_data_d = {in_data_dims, memory::data_type::f32, memory::format_tag::tnc};
out_data_d = {out_data_dims, memory::data_type::f32, memory::format_tag::tnc};
std::vector<TensorDesc> in_candidate;
if (nativeOrder)
in_candidate.push_back(in_data_d);
else
in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::f32, memory::ntc});
in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::data_type::f32, memory::format_tag::ntc});
for (int i = 1; i < ins.size(); i++)
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
std::vector<TensorDesc> out_candidate;
std::vector<memory::format> outputFormats;
if (nativeOrder) {
out_candidate.push_back(out_data_d);
outputFormats.push_back(out_data_d.getFormat());
} else {
out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::f32, memory::ntc});
outputFormats.push_back(memory::ntc);
out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::data_type::f32, memory::format_tag::ntc});
}
for (int i = 1; i < outs.size(); i++) {
out_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, memory::f32, memory::nc});
outputFormats.push_back(memory::nc);
out_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, memory::data_type::f32, memory::format_tag::nc});
}
createDescriptor(in_candidate, out_candidate, outputFormats);
createDescriptor(in_candidate, out_candidate);
}
void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
const std::vector<TensorDesc> &outputDesc,
const std::vector<memory::format> &outputFormats) {
MKLDNNDescriptor desc(std::shared_ptr<rnn_forward::desc>(
new rnn_forward::desc(forward_scoring, cell_desc,
direction,
/* In Data */ in_data_d,
/* In State */ in_state_d,
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_state_d)));
descs.push_back(desc);
const std::vector<TensorDesc> &outputDesc) {
switch (cell_type) {
case mkldnn::algorithm::vanilla_rnn: {
MKLDNNDescriptor desc(std::shared_ptr<vanilla_rnn_forward::desc>(
new vanilla_rnn_forward::desc(prop_kind::forward_scoring, cell_act, direction,
/* In Data */ in_data_d,
/* In State */ in_states_d[0],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_states_d[0])));
descs.push_back(desc);
} break;
case mkldnn::algorithm::vanilla_gru: {
MKLDNNDescriptor desc(std::shared_ptr<gru_forward::desc>(
new gru_forward::desc(prop_kind::forward_scoring, direction,
/* In Data */ in_data_d,
/* In State */ in_states_d[0],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_states_d[0])));
descs.push_back(desc);
} break;
case mkldnn::algorithm::lbr_gru: {
MKLDNNDescriptor desc(std::shared_ptr<lbr_gru_forward::desc>(
new lbr_gru_forward::desc(prop_kind::forward_scoring, direction,
/* In Data */ in_data_d,
/* In State */ in_states_d[0],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_states_d[0])));
descs.push_back(desc);
} break;
case mkldnn::algorithm::vanilla_lstm: {
MKLDNNDescriptor desc(std::shared_ptr<lstm_forward::desc>(
new lstm_forward::desc(prop_kind::forward_scoring, direction,
/* In Data */ in_data_d,
/* In State H */ in_states_d[0],
/* In State C */ in_states_d[1],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State H */ out_states_d[0],
/* Out State C */ out_states_d[1])));
descs.push_back(desc);
} break;
default:
THROW_IE_EXCEPTION << "Unknown cell type";
}
// Fill supported config
InferenceEngine::LayerConfig config;
@ -326,7 +385,7 @@ void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
config.outConfs.push_back(dataConfig);
}
supportedPrimitiveDescriptors.emplace_back(config, ref_any, outputFormats);
supportedPrimitiveDescriptors.emplace_back(config, ref_any);
}
void MKLDNNRNN::createPrimitive() {
@ -342,8 +401,7 @@ void MKLDNNRNN::createPrimitive() {
&& getCnnLayer()->blobs["biases"]->getTensorDesc().getPrecision() != Precision::FP32)
THROW_IE_EXCEPTION << errorPrefix << " has invalid biases precision: " << getCnnLayer()->blobs["biases"]->getTensorDesc().getPrecision();
std::shared_ptr<rnn_forward::desc> d = descs[0];
rnn_forward::primitive_desc pd(*d, getEngine());
auto pd = descs[0].createPrimitiveDescriptorIterator(getEngine());
auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
@ -387,22 +445,22 @@ void MKLDNNRNN::createPrimitive() {
const int gate_map_lstm_size = sizeof(gate_map_lstm) / sizeof(int);
const int gate_map_gru_size = sizeof(gate_map_gru) / sizeof(int);
const int gate_map_rnn_size = sizeof(gate_map_rnn) / sizeof(int);
if (cell_desc.get_cell_kind() == vanilla_lstm) {
if (cell_type == algorithm::vanilla_lstm) {
gate_map = gate_map_lstm;
if (G > gate_map_lstm_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
}
} else if (cell_desc.get_cell_kind() == vanilla_gru) {
} else if (cell_type == algorithm::vanilla_gru) {
gate_map = gate_map_gru;
if (G > gate_map_gru_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
}
} else if (cell_desc.get_cell_kind() == gru_linear_before_reset) {
} else if (cell_type == algorithm::lbr_gru) {
gate_map = gate_map_gru;
if (G > gate_map_gru_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
}
} else if (cell_desc.get_cell_kind() == vanilla_rnn) {
} else if (cell_type == algorithm::vanilla_rnn) {
gate_map = gate_map_rnn;
if (G > gate_map_rnn_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
@ -448,76 +506,48 @@ void MKLDNNRNN::createPrimitive() {
}
}
auto src_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
src_state_mem->Create(in_state_d);
internalBlobMemory.push_back(src_state_mem);
if (in_state_d) {
int offset = 0;
for (int i = 0; i < S; i++) {
/* create copy/concat primitive */
auto src_stat = getParentEdgeAt(i+1)->getMemory().GetPrimitive();
auto state_mem = std::make_shared<MKLDNNMemory>(getEngine());
state_mem->Create(
src_stat.get_primitive_desc().desc(),
static_cast<uint8_t *>(src_state_mem->GetPrimitive().get_data_handle()) + offset);
offset += src_stat.get_primitive_desc().get_size();
internalBlobMemory.push_back(state_mem);
exec_before.emplace_back(src_stat, state_mem->GetPrimitive());
}
}
auto dst_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
dst_state_mem->Create(out_state_d);
internalBlobMemory.push_back(dst_state_mem);
if (out_state_d) {
int offset = 0;
int idx_start = is_cell ? 0 : 1;
for (int i = 0; i < S; i++) {
/* create copy/split primitive */
auto dst_stat = getChildEdgeAt(idx_start + i)->getMemory().GetPrimitive();
auto state_mem = std::make_shared<MKLDNNMemory>(getEngine());
state_mem->Create(
dst_stat.get_primitive_desc().desc(),
static_cast<uint8_t *>(dst_state_mem->GetPrimitive().get_data_handle()) + offset);
offset += dst_stat.get_primitive_desc().get_size();
internalBlobMemory.push_back(state_mem);
if (is_cell && i == 0) continue;
exec_after.emplace_back(state_mem->GetPrimitive(), dst_stat);
}
}
auto workspace_mem = std::make_shared<MKLDNNMemory>(getEngine());
workspace_mem->Create({}, memory::f32, memory::format_undef, nullptr); // stub, not in use
internalBlobMemory.push_back(workspace_mem);
auto p = new rnn_forward(pd,
/* In Data */ src_data_mem ->GetPrimitive(),
/* In State */ src_state_mem->GetPrimitive(),
/* Weights data */ w_data_mem ->GetPrimitive(),
/* Weights state */ w_state_mem ->GetPrimitive(),
/* Bias */ w_bias_mem ->GetPrimitive(),
/* Out Data */ dst_data_mem ->GetPrimitive(),
/* Out State */ dst_state_mem->GetPrimitive(),
/* Workspace */ workspace_mem->GetPrimitive());
prim.reset(p);
prim.reset(new mkldnn::primitive(pd));
}
void MKLDNNRNN::execute(mkldnn::stream strm) {
if (!exec_before.empty())
strm.submit({exec_before.begin(), exec_before.end()});
if (!prim)
THROW_IE_EXCEPTION << "No initialized primitive to execute";
if (prim)
strm.submit({*prim});
const auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
const auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
if (!exec_after.empty())
strm.submit({exec_after.begin(), exec_after.end()});
const auto &wgh_data_mem = internalBlobMemory[0];
const auto &wgh_stat_mem = internalBlobMemory[1];
const auto &wgh_bias_mem = internalBlobMemory[2];
std::unordered_map<int, memory> args {
{DNNL_ARG_SRC_LAYER, src_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_LAYER, wgh_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_ITER, wgh_stat_mem->GetPrimitive()},
{DNNL_ARG_BIAS, wgh_bias_mem->GetPrimitive()},
{DNNL_ARG_DST_LAYER, dst_data_mem->GetPrimitive()},
};
int state_i_tags[] {DNNL_ARG_SRC_ITER, DNNL_ARG_SRC_ITER_C};
int state_o_tags[] {DNNL_ARG_DST_ITER, DNNL_ARG_DST_ITER_C};
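// SRC/DST_ITER carry the hidden state; the *_ITER_C tags address the LSTM cell
// state and are only reached when S == 2 (algorithm::vanilla_lstm).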
for (size_t s = 0; s < S; s++) {
args[state_i_tags[s]] = getParentEdgeAt(s+1)->getMemoryPtr()->GetPrimitive();
}
if (is_cell) {
for (size_t s = 0; s < S; s++) {
args[state_o_tags[s]] = getChildEdgesAtPort(s)[0]->getMemoryPtr()->GetPrimitive();
}
} else {
ptrdiff_t n_ports_with_init_states = outDims.size() - 1; // first is a sequence data
for (size_t s = 0; s < std::min(S, n_ports_with_init_states); s++) {
if (s < inDims.size()) {
args[state_o_tags[s]] = getChildEdgesAtPort(s+1)[0]->getMemoryPtr()->GetPrimitive();
}
}
}
(*prim).execute(strm, args);
}
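This is the general oneDNN v1.x execution model: primitives no longer capture their memories at construction, so every execute() call passes an argument map keyed by DNNL_ARG_* tags. A minimal sketch of the pattern (hypothetical memories, illustration only):

    // Hypothetical sketch of the v1.x stateless execution contract.
    std::unordered_map<int, mkldnn::memory> args {
        {DNNL_ARG_SRC_LAYER, src_mem},    // input sequence
        {DNNL_ARG_WEIGHTS_LAYER, w_mem},  // input-to-hidden weights
        {DNNL_ARG_DST_LAYER, dst_mem},    // output sequence
    };
    rnn_prim.execute(strm, args);         // enqueue on the stream, no submit()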
REG_MKLDNN_PRIM_FOR(MKLDNNRNN, RNNCell);

View File

@ -20,10 +20,8 @@ public:
void getSupportedDescriptors() override;
void createPrimitive() override;
bool created() const override;
using MKLDNNNode::createDescriptor;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc,
const std::vector<mkldnn::memory::format> &outputFormats);
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void execute(mkldnn::stream strm) override;
@ -39,10 +37,13 @@ private:
bool nativeOrder = true;
/** Direction of iteration through sequence dimension */
mkldnn::rnn_direction direction = mkldnn::unidirectional;
mkldnn::rnn_direction direction = mkldnn::rnn_direction::unidirectional;
/** RNN Cell desc (type/activation_alg/clip)*/
mkldnn::rnn_cell::desc cell_desc { mkldnn::algorithm::vanilla_lstm };
/** RNN Cell type (type/activation_alg/clip)*/
mkldnn::algorithm cell_type = mkldnn::algorithm::vanilla_lstm;
/** activation type for vanilla RNN cell */
mkldnn::algorithm cell_act = mkldnn::algorithm::eltwise_tanh;
// Internal attributes
ptrdiff_t N = 0; /**< Batch value */
@ -58,8 +59,8 @@ private:
MKLDNNMemoryDesc in_data_d;
MKLDNNMemoryDesc out_data_d;
MKLDNNMemoryDesc in_state_d;
MKLDNNMemoryDesc out_state_d;
std::vector<MKLDNNMemoryDesc> in_states_d;
std::vector<MKLDNNMemoryDesc> out_states_d;
MKLDNNMemoryDesc w_data_d;
MKLDNNMemoryDesc w_state_d;

View File

@ -11,7 +11,7 @@
#include <mkldnn_extension_utils.h>
#include <mkldnn_types.h>
#include <utils/bfloat16.hpp>
#include <cpu_isa_traits.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include "ie_parallel.hpp"
#include <mkldnn_selective_build.h>
@ -19,6 +19,7 @@ using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
MKLDNNROIAlignNode::MKLDNNROIAlignNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache)
@ -100,17 +101,17 @@ void MKLDNNROIAlignNode::initSupportedPrimitiveDescriptors() {
config.inConfs.resize(3);
config.outConfs.resize(1);
std::vector<std::pair<memory::format, memory::format> > supportedFormats {
{memory::nchw, memory::nchw},
{memory::nhwc, memory::nhwc},
{memory::nChw16c, memory::nChw16c},
{memory::nChw8c, memory::nChw8c}
std::vector<std::pair<memory::format_tag, memory::format_tag>> supportedFormats {
{memory::format_tag::nchw, memory::format_tag::nchw},
{memory::format_tag::nhwc, memory::format_tag::nhwc},
{memory::format_tag::nChw16c, memory::format_tag::nChw16c},
{memory::format_tag::nChw8c, memory::format_tag::nChw8c}
};
for (auto fmts : supportedFormats) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmts.first);
config.inConfs[1].desc = MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), memory::f32, memory::nc);
config.inConfs[2].desc = MKLDNNMemoryDesc(getParentEdgeAt(2)->getDims(), memory::s32, memory::x);
config.inConfs[1].desc = MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), memory::data_type::f32, memory::format_tag::nc);
config.inConfs[2].desc = MKLDNNMemoryDesc(getParentEdgeAt(2)->getDims(), memory::data_type::s32, memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmts.second);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, fmts.second});
}
@ -153,16 +154,17 @@ void MKLDNNROIAlignNode::executeSpecified() {
auto &srcMemory1 = getParentEdgeAt(1)->getMemory();
auto &dstMemory = getChildEdgeAt(0)->getMemory();
auto srcBlockDesc = srcMemory0.GetDescriptor().data.layout_desc.blocking;
auto dstBlockDesc = dstMemory.GetDescriptor().data.layout_desc.blocking;
auto srcBlockDesc = srcMemory0.GetDescriptor().data.format_desc.blocking;
auto dstBlockDesc = dstMemory.GetDescriptor().data.format_desc.blocking;
int blockSize = srcBlockDesc.block_dims[1];
auto selectedFmt = srcMemory0.GetDescriptor().data.format;
int blockSize = srcBlockDesc.inner_nblks > 0 ? srcBlockDesc.inner_blks[0] : 1;
auto isPlainFmt = srcMemory0.GetDesc().isPlainFormat();
auto isNhwcFmt = srcMemory0.GetDesc().isTailCFormat();
const auto *srcData = reinterpret_cast<const inputType *>(getDataPtr(getParentEdgeAt(0)->getMemory()));
const auto *srcRoi = reinterpret_cast<const float *>(getDataPtr(getParentEdgeAt(1)->getMemory()));
const auto *srcRoiIdx = reinterpret_cast<const int *>(getDataPtr(getParentEdgeAt(2)->getMemory()));
auto *dst = reinterpret_cast<outputType *>(getDataPtr(getChildEdgeAt(0)->getMemory()));
const auto *srcData = reinterpret_cast<const inputType *>(getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
const auto *srcRoi = reinterpret_cast<const float *>(getParentEdgeAt(1)->getMemoryPtr()->GetPtr());
const auto *srcRoiIdx = reinterpret_cast<const int *>(getParentEdgeAt(2)->getMemoryPtr()->GetPtr());
auto *dst = reinterpret_cast<outputType *>(getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
auto nominalRoiCount = static_cast<int>(srcMemory1.GetDims()[0]);
int realRois = 0;
@ -173,11 +175,11 @@ void MKLDNNROIAlignNode::executeSpecified() {
const int binCount = pooledH * pooledW;
const int hInputStride = srcBlockDesc.strides[0][2];
const int wInputStride = srcBlockDesc.strides[0][3];
const int hOutputStride = dstBlockDesc.strides[0][2];
const int wOutputStride = dstBlockDesc.strides[0][3];
const int chPadding = srcBlockDesc.padding_dims[1];
const int hInputStride = srcBlockDesc.strides[2];
const int wInputStride = srcBlockDesc.strides[3];
const int hOutputStride = dstBlockDesc.strides[2];
const int wOutputStride = dstBlockDesc.strides[3];
const int chPadding = srcMemory0.GetDescriptor().data.padded_dims[1];
const int blockCount = chPadding / blockSize;
for (; realRois < nominalRoiCount; realRois++) {
@ -317,7 +319,7 @@ void MKLDNNROIAlignNode::executeSpecified() {
xBinInd_ * wOutputStride + blockResidual_;
dst[dstIndex] = pooledValue;
};
if (selectedFmt == mkldnn_nhwc) {
if (isNhwcFmt) {
parallel_for2d(pooledH, pooledW, [&](int yBinInd, int xBinInd) {
for (int c = 0; c < C; c++) {
size_t binOffsetInput = roiBatchInd * C * H * W + c;
@ -330,7 +332,7 @@ void MKLDNNROIAlignNode::executeSpecified() {
int cStart = blkIdx * blockSize;
int cEnd = (blkIdx == blockCount - 1 ? C : cStart + blockSize);
for (int c = cStart; c < cEnd; c++) {
const int blockResidual = (selectedFmt == mkldnn_nchw ? 0 : c % blockSize);
const int blockResidual = (isPlainFmt ? 0 : c % blockSize);
const int blockIdx = (c / blockSize) * blockSize;
size_t binOffsetInput = (roiBatchInd * chPadding + blockIdx) * H * W;
size_t binOffsetOutput = (n * chPadding + blockIdx) * binCount;
@ -341,11 +343,6 @@ void MKLDNNROIAlignNode::executeSpecified() {
}
}
inline uint8_t* MKLDNNROIAlignNode::getDataPtr(const MKLDNNMemory& memoryPtr) const {
return reinterpret_cast<uint8_t*>(memoryPtr.GetData()) + memoryPtr.GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(memoryPtr.GetDescriptor().data.data_type));
}
bool MKLDNNROIAlignNode::created() const {
return getType() == ROIAlign;
}
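For reference, the blocking-descriptor migration applied in this node (v0.x -> v1.x):

    // layout_desc.blocking.block_dims[1]   -> format_desc.blocking.inner_blks[0]
    //                                         (guarded by inner_nblks > 0)
    // layout_desc.blocking.strides[0][i]   -> format_desc.blocking.strides[i]
    // layout_desc.blocking.padding_dims[1] -> data.padded_dims[1]
    // Raw enum checks (mkldnn_nhwc, mkldnn_nchw) give way to the plugin helpers
    // GetDesc().isPlainFormat() / isTailCFormat().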

View File

@ -39,7 +39,6 @@ private:
void executeSpecified();
template<typename T>
struct ROIAlignExecute;
inline uint8_t* getDataPtr(const MKLDNNMemory& memoryPtr) const;
};
} // namespace MKLDNNPlugin

View File

@ -3,16 +3,249 @@
//
#include "mkldnn_roi_pooling_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
#include <vector>
#include <math.h>
#include <mkldnn_extension_utils.h>
#include <cpu/x64/jit_generator.hpp>
#include "ie_parallel.hpp"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
#define GET_OFF(field) offsetof(jit_roi_pooling_call_args, field)
template <cpu_isa_t isa>
struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_roi_pooling_kernel_f32)
explicit jit_uni_roi_pooling_kernel_f32(jit_roi_pooling_params jcp) : jit_uni_roi_pooling_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
};
void generate() override {
this->preamble();
Label exit_label;
Label tail_label;
mov(reg_input, ptr[this->param1 + GET_OFF(src)]);
mov(reg_output, ptr[this->param1 + GET_OFF(dst)]);
mov(reg_bin_area, ptr[this->param1 + GET_OFF(bin_area)]);
mov(reg_c_blocks, ptr[this->param1 + GET_OFF(c_blocks)]);
if (jpp_.alg == ROIPoolingOpType::Max) {
mov(reg_kh, ptr[this->param1 + GET_OFF(kh)]);
mov(reg_kw, ptr[this->param1 + GET_OFF(kw)]);
} else {
mov(reg_yf, ptr[this->param1 + GET_OFF(yf)]);
mov(reg_xf, ptr[this->param1 + GET_OFF(xf)]);
mov(reg_yoff, ptr[this->param1 + GET_OFF(yoff)]);
mov(reg_xoff, ptr[this->param1 + GET_OFF(xoff)]);
}
int nb_c_tail = jpp_.nb_c % jpp_.nb_c_blocking;
cmp(reg_c_blocks, jpp_.nb_c_blocking);
jne(nb_c_tail ? tail_label : exit_label, T_NEAR);
loop_body(jpp_.nb_c_blocking);
jmp(exit_label, T_NEAR);
if (nb_c_tail) {
L(tail_label);
loop_body(nb_c_tail);
}
L(exit_label);
this->postamble();
}
private:
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
const int vlen = cpu_isa_traits<isa>::vlen;
Vmm vmm_mask = Vmm(0);
Vmm vmm_zero = Vmm(0);
Xmm xmm_yf = Xmm(0);
Vmm vmm_yf = Vmm(0);
Xmm xmm_xf = Xmm(1);
Vmm vmm_xf = Vmm(1);
Vmm get_acc_reg(int idx) { return Vmm(2*idx + 1); }
Vmm get_src_reg(int idx) { return Vmm(2*idx + 2); }
Opmask k_store_mask = Opmask(7);
const unsigned char _cmp_lt_os = 1;
using reg64_t = const Xbyak::Reg64;
reg64_t reg_input = r8;
reg64_t aux_reg_input = rax;
reg64_t aux_reg_input1 = rdx;
reg64_t reg_output = r9;
reg64_t reg_kh = r10;
reg64_t reg_kw = r11;
reg64_t h_iter = r14;
reg64_t w_iter = r15;
reg64_t reg_c_blocks = rbx;
reg64_t reg_bin_area = rdx;
reg64_t reg_yf = reg_kh;
reg64_t reg_xf = reg_kw;
reg64_t reg_yoff = h_iter;
reg64_t reg_xoff = r12;
void roi_pool_max(int c_blocks) {
Label h_loop_label;
Label w_loop_label;
mov(aux_reg_input, reg_input);
for (int i = 0; i < c_blocks; i++) {
Vmm vmm_max = get_acc_reg(i);
uni_vmovups(vmm_max, ptr[reg_input + i * jpp_.ih * jpp_.iw * jpp_.c_block * sizeof(float)]);
}
xor_(h_iter, h_iter);
L(h_loop_label); {
xor_(w_iter, w_iter);
mov(aux_reg_input1, aux_reg_input);
L(w_loop_label); {
for (int i = 0; i < c_blocks; i++) {
Vmm vmm_max = get_acc_reg(i);
Vmm vmm_src = get_src_reg(i);
uni_vmovups(vmm_src, ptr[aux_reg_input1 + i * jpp_.ih * jpp_.iw * jpp_.c_block * sizeof(float)]);
if (isa == cpu::x64::sse41) {
movups(vmm_mask, vmm_max);
cmpps(vmm_mask, vmm_src, _cmp_lt_os);
blendvps(vmm_max, vmm_src);
} else if (isa == cpu::x64::avx2) {
vcmpps(vmm_mask, vmm_max, vmm_src, _cmp_lt_os);
vblendvps(vmm_max, vmm_max, vmm_src, vmm_mask);
} else if (isa == cpu::x64::avx512_common) {
vcmpps(k_store_mask, vmm_max, vmm_src, _cmp_lt_os);
vblendmps(vmm_max | k_store_mask, vmm_max, vmm_src);
}
}
add(aux_reg_input1, jpp_.c_block * sizeof(float));
inc(w_iter);
cmp(w_iter, reg_kw);
jl(w_loop_label, T_NEAR);
}
add(aux_reg_input, jpp_.iw * jpp_.c_block * sizeof(float));
inc(h_iter);
cmp(h_iter, reg_kh);
jl(h_loop_label, T_NEAR);
}
for (int i = 0; i < c_blocks; i++) {
Vmm vmm_dst = get_acc_reg(i);
uni_vmovups(ptr[reg_output + i * jpp_.oh * jpp_.ow * jpp_.c_block * sizeof(float)], vmm_dst);
}
}
void roi_pool_bilinear(int c_blocks) {
movq(xmm_yf, reg_yf);
uni_vbroadcastss(vmm_yf, xmm_yf);
movq(xmm_xf, reg_xf);
uni_vbroadcastss(vmm_xf, xmm_xf);
Vmm vmm_src00 = get_src_reg(0);
Vmm vmm_src01 = get_src_reg(1);
Vmm vmm_src10 = get_src_reg(2);
Vmm vmm_src11 = get_src_reg(3);
for (int i = 0; i < c_blocks; i++) {
int src_c_off = i * jpp_.ih * jpp_.iw * jpp_.c_block * sizeof(float);
mov(aux_reg_input, reg_input);
uni_vmovups(vmm_src00, ptr[aux_reg_input + src_c_off]);
add(aux_reg_input, reg_xoff);
uni_vmovups(vmm_src01, ptr[aux_reg_input + src_c_off]);
add(aux_reg_input, reg_yoff);
uni_vmovups(vmm_src11, ptr[aux_reg_input + src_c_off]);
sub(aux_reg_input, reg_xoff);
uni_vmovups(vmm_src10, ptr[aux_reg_input + src_c_off]);
uni_vsubps(vmm_src01, vmm_src01, vmm_src00);
uni_vfmadd213ps(vmm_src01, vmm_xf, vmm_src00);
uni_vsubps(vmm_src11, vmm_src11, vmm_src10);
uni_vfmadd213ps(vmm_src11, vmm_xf, vmm_src10);
uni_vsubps(vmm_src11, vmm_src11, vmm_src01);
uni_vfmadd213ps(vmm_src11, vmm_yf, vmm_src01);
int dst_c_off = i * jpp_.oh * jpp_.ow * jpp_.c_block * sizeof(float);
uni_vmovups(ptr[reg_output + dst_c_off], vmm_src11);
}
}
void empty_roi(int c_blocks) {
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
for (int i = 0; i < c_blocks; i++) {
uni_vmovups(ptr[reg_output + i * jpp_.oh * jpp_.ow * jpp_.c_block * sizeof(float)], vmm_zero);
}
}
void loop_body(int c_blocks) {
Label empty_roi_label;
Label exit_label;
cmp(reg_bin_area, 0);
je(empty_roi_label, T_NEAR);
if (jpp_.alg == ROIPoolingOpType::Max)
roi_pool_max(c_blocks);
else
roi_pool_bilinear(c_blocks);
if (isa == cpu::x64::sse41) {
add(reg_input, 4 * sizeof(float));
add(reg_output, 4 * sizeof(float));
if (jpp_.alg == ROIPoolingOpType::Max)
roi_pool_max(c_blocks);
else
roi_pool_bilinear(c_blocks);
}
jmp(exit_label, T_NEAR);
L(empty_roi_label);
empty_roi(c_blocks);
if (isa == cpu::x64::sse41) {
add(reg_output, 4 * sizeof(float));
empty_roi(c_blocks);
}
L(exit_label);
}
};
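One detail worth noting in loop_body(): the plugin's channel block is 8 floats for both AVX2 and SSE4.1 (see jpp.c_block below), but an XMM register holds only four, so the SSE4.1 path runs the pooling body twice with src/dst advanced by 4 * sizeof(float). Compressed, with do_body standing in for roi_pool_max/roi_pool_bilinear/empty_roi, the pattern is:

    // Covering an 8-float channel block with 4-wide XMM operations.
    do_body(c_blocks);                      // floats 0..3 of each block
    if (isa == cpu::x64::sse41) {
        add(reg_input, 4 * sizeof(float));  // step to floats 4..7
        add(reg_output, 4 * sizeof(float));
        do_body(c_blocks);                  // second 4-wide pass
    }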
MKLDNNROIPoolingNode::MKLDNNROIPoolingNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache)
@ -22,94 +255,312 @@ void MKLDNNROIPoolingNode::getSupportedDescriptors() {
if (!descs.empty())
return;
InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
if (precision != InferenceEngine::Precision::FP32)
precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
precision = getCnnLayer()->outData[0]->getPrecision();
if (precision != InferenceEngine::Precision::FP32)
precision = InferenceEngine::Precision::FP32;
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
GenericLayer* genericLayer = getCnnLayer().get();
if (genericLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert ROIPooling layer.";
if (getParentEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
std::string errorPrefix = "ROIPooling layer with name '" + getName() + "' ";
if (getParentEdges().size() != 2)
THROW_IE_EXCEPTION << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size();
if (getChildEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
THROW_IE_EXCEPTION << errorPrefix << "has incorrect number of output edges: " << getChildEdges().size();
if (getParentEdgeAt(0)->getDims().ndims() != 4) {
THROW_IE_EXCEPTION << errorPrefix << "doesn't support 0th input with rank: " << getParentEdgeAt(0)->getDims().ndims();
}
if (getParentEdgeAt(1)->getDims().ndims() != 2) {
THROW_IE_EXCEPTION << errorPrefix << "doesn't support 1st input with rank: " << getParentEdgeAt(1)->getDims().ndims();
}
if (getChildEdgeAt(0)->getDims().ndims() != 4) {
THROW_IE_EXCEPTION << errorPrefix << "doesn't support output with rank: " << getChildEdgeAt(0)->getDims().ndims();
}
if (getParentEdgeAt(1)->getDims()[1] != 5) {
THROW_IE_EXCEPTION << errorPrefix << "has invalid shape on 1st input: ["
<< getParentEdgeAt(1)->getDims()[0] << "," << getParentEdgeAt(1)->getDims()[1] << "]";
}
pooled_h = genericLayer->GetParamAsInt("pooled_h");
pooled_w = genericLayer->GetParamAsInt("pooled_w");
spatial_scale = genericLayer->GetParamAsFloat("spatial_scale");
std::string m = genericLayer->GetParamAsString("method", "max");
if (m == "max") {
method = mkldnn::algorithm::roi_pooling_max;
opType = ROIPoolingOpType::Max;
} else if (m == "bilinear") {
method = mkldnn::algorithm::roi_pooling_bilinear;
opType = ROIPoolingOpType::Bilinear;
} else {
THROW_IE_EXCEPTION << "Unsupported roi pooling method";
THROW_IE_EXCEPTION << errorPrefix << "doesn't support roi pooling method: " << m;
}
}
void MKLDNNROIPoolingNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
InferenceEngine::LayerConfig config;
config.dynBatchSupport = false;
config.inConfs.resize(2);
config.inConfs[0].constant = false;
config.inConfs[0].inPlace = -1;
config.inConfs[1].constant = false;
config.inConfs[1].inPlace = -1;
config.outConfs.resize(1);
config.outConfs[0].constant = false;
config.outConfs[0].inPlace = -1;
auto parentDims = getParentEdgeAt(0)->getDims();
for (auto format : getAvailableFormatsForDims(parentDims)) {
std::vector<InferenceEngine::TensorDesc> srcs;
srcs.push_back(MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format));
srcs.push_back(MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), inputDataType, memory::nc));
MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, format);
createDescriptor(srcs, {out_candidate});
auto format = mayiuse(avx512_common) ? memory::format_tag::nChw16c : memory::format_tag::nChw8c;
impl_desc_type impl_type;
if (mayiuse(cpu::x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(cpu::x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
} else {
impl_type = impl_desc_type::ref;
}
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), memory::data_type::f32, format);
config.inConfs[1].desc = MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), memory::data_type::f32, memory::format_tag::nc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), memory::data_type::f32, format);
supportedPrimitiveDescriptors.push_back({config, impl_type, format});
}
void MKLDNNROIPoolingNode::createPrimitive() {
if (prim)
return;
auto config = getSelectedPrimitiveDescriptor()->getConfig();
std::vector<memory::desc> srcs;
for (size_t i = 0; i < getParentEdges().size(); i++) {
srcs.push_back(getParentEdgeAt(i)->getMemory().GetDescriptor());
const int simd_w = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
jpp.c_block = simd_w;
auto inDims = config.inConfs[0].desc.getDims();
auto outDims = config.outConfs[0].desc.getDims();
jpp.mb = outDims[0];
jpp.c = rnd_up(inDims[1], simd_w);
jpp.ih = inDims[2];
jpp.iw = inDims[3];
jpp.oh = outDims[2];
jpp.ow = outDims[3];
jpp.spatial_scale = spatial_scale;
jpp.pooled_h = pooled_h;
jpp.pooled_w = pooled_w;
jpp.nb_c = jpp.c / jpp.c_block;
jpp.nb_c_blocking = mayiuse(cpu::x64::avx512_common) ? 15 : 7;
jpp.alg = opType;
if (mayiuse(cpu::x64::avx512_common)) {
roi_pooling_kernel.reset(new jit_uni_roi_pooling_kernel_f32<cpu::x64::avx512_common>(jpp));
} else if (mayiuse(cpu::x64::avx2)) {
roi_pooling_kernel.reset(new jit_uni_roi_pooling_kernel_f32<cpu::x64::avx2>(jpp));
} else if (mayiuse(cpu::x64::sse41)) {
roi_pooling_kernel.reset(new jit_uni_roi_pooling_kernel_f32<cpu::x64::sse41>(jpp));
}
memory::desc out_candidate = getChildEdgeAt(0)->getMemory().GetDescriptor();
MKLDNNDescriptor desc(std::shared_ptr<roi_pooling_forward::desc>(
new roi_pooling_forward::desc(prop_kind::forward_scoring, method, srcs, out_candidate, pooled_h, pooled_w,
spatial_scale)));
descs[0] = desc;
std::shared_ptr<roi_pooling_forward::desc> selected_desc_ptr = descs[0];
const PrimitiveDescInfo *selected_pd = getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set for node " << getName() << ".";
auto prim_desc = roi_pooling_forward::primitive_desc(*selected_desc_ptr, getEngine());
primitive_desc_iterator itpd = descs[0].createPrimitiveDescriptorIterator(getEngine());
std::vector<primitive::at> src_p;
for (size_t i = 0; i < getParentEdges().size(); i++) {
src_p.push_back(getParentEdgeAt(i)->getMemoryPtr()->GetPrimitive());
}
prim.reset(new roi_pooling_forward(prim_desc, src_p, getChildEdgeAt(0)->getMemory().GetPrimitive()));
if (roi_pooling_kernel)
roi_pooling_kernel->create_ker();
}
void MKLDNNROIPoolingNode::execute(mkldnn::stream strm) {
auto &srcMemory0 = getParentEdgeAt(0)->getMemory();
auto &srcMemory1 = getParentEdgeAt(1)->getMemory();
auto &dstMemory = getChildEdgeAt(0)->getMemory();
const auto *src_data = reinterpret_cast<const float *>(srcMemory0.GetPtr());
const auto *src_roi = reinterpret_cast<const float *>(srcMemory1.GetPtr());
float *dst = reinterpret_cast<float *>(dstMemory.GetPtr());
auto config = getSelectedPrimitiveDescriptor()->getConfig();
auto src_strides = config.inConfs[0].desc.getBlockingDesc().getStrides();
auto dst_strides = config.outConfs[0].desc.getBlockingDesc().getStrides();
int cb_work = impl::utils::div_up(jpp.nb_c, jpp.nb_c_blocking);
int MB = jpp.mb;
size_t src_roi_step = config.inConfs[1].desc.getBlockingDesc().getStrides()[0];
int real_rois = 0;
for (; real_rois < MB; real_rois++) {
size_t roi_off = real_rois * src_roi_step;
const float *src_roi_ptr = &src_roi[roi_off];
int roi_batch_ind = static_cast<int>(src_roi_ptr[0]);
if (roi_batch_ind == -1) {
break;
}
}
parallel_for4d(MB, cb_work, jpp.oh, jpp.ow, [&](int n, int cbb, int oh, int ow) {
auto arg = jit_roi_pooling_call_args();
int cb = cbb * jpp.nb_c_blocking;
int cb_num = jpp.nb_c_blocking;
int c_block = jpp.c_block;
arg.c_blocks = std::min(cb + cb_num, jpp.nb_c) - cb;
if (n >= real_rois) {
if (roi_pooling_kernel) {
arg.bin_area = 0;
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
(*roi_pooling_kernel)(&arg);
} else {
for (int c = 0; c < c_block; c++) {
dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c] = 0;
}
}
} else {
size_t roi_off = n * src_roi_step;
const float* src_roi_ptr = &src_roi[roi_off];
int roi_batch_ind = static_cast<int>(src_roi_ptr[0]);
if (jpp.alg == ROIPoolingOpType::Max) {
int roi_start_w = static_cast<int>(round(src_roi_ptr[1] * jpp.spatial_scale));
int roi_start_h = static_cast<int>(round(src_roi_ptr[2] * jpp.spatial_scale));
int roi_end_w = static_cast<int>(round(src_roi_ptr[3] * jpp.spatial_scale));
int roi_end_h = static_cast<int>(round(src_roi_ptr[4] * jpp.spatial_scale));
int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
int hstart = (oh * roi_height) / jpp.pooled_h;
if ((hstart * jpp.pooled_h) > (oh * roi_height)) {
--hstart;
}
int wstart = (ow * roi_width) / jpp.pooled_w;
if ((wstart * jpp.pooled_w) > (ow * roi_width)) {
--wstart;
}
int hend = ((oh + 1) * roi_height) / jpp.pooled_h;
if ((hend * jpp.pooled_h) < ((oh + 1) * roi_height)) {
++hend;
}
int wend = ((ow + 1) * roi_width) / jpp.pooled_w;
if ((wend * jpp.pooled_w) < ((ow + 1) * roi_width)) {
++wend;
}
hstart = std::min(std::max(hstart + roi_start_h, 0), jpp.ih);
hend = std::min(std::max(hend + roi_start_h, 0), jpp.ih);
wstart = std::min(std::max(wstart + roi_start_w, 0), jpp.iw);
wend = std::min(std::max(wend + roi_start_w, 0), jpp.iw);
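// The adjust-by-one branches above compute exact integer floor/ceil of the
// proportional bin bounds: hstart = floor(oh * roi_height / pooled_h) and
// hend = ceil((oh + 1) * roi_height / pooled_h), likewise for wstart/wend,
// with both then clamped into [0, ih] and [0, iw].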
if (roi_pooling_kernel) {
arg.src = &src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] + hstart * src_strides[2] + wstart * src_strides[3]];
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
arg.bin_area = (hend - hstart) * (wend - wstart);
arg.kh = hend - hstart;
arg.kw = wend - wstart;
} else {
for (int c = 0; c < c_block; c++) {
const size_t pool_index = n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c;
if ((hend <= hstart) || (wend <= wstart)) {
dst[pool_index] = 0;
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
float batch_data = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
h * src_strides[2] + w * src_strides[3] + c];
if (batch_data > dst[pool_index]) {
dst[pool_index] = batch_data;
}
}
}
}
}
}
} else {
float roi_start_w_ = src_roi_ptr[1];
float roi_start_h_ = src_roi_ptr[2];
float roi_end_w_ = src_roi_ptr[3];
float roi_end_h_ = src_roi_ptr[4];
float height_scale = ((roi_end_h_ - roi_start_h_) * (jpp.ih - 1)) / (jpp.pooled_h - 1);
float width_scale = ((roi_end_w_ - roi_start_w_) * (jpp.iw - 1)) / (jpp.pooled_w - 1);
float in_y = (oh * height_scale + roi_start_h_ * (jpp.ih - 1));
float in_x = (ow * width_scale + roi_start_w_ * (jpp.iw - 1));
if (in_y < 0 || in_y > jpp.ih - 1 || in_x < 0 || in_x > jpp.iw - 1) {
if (roi_pooling_kernel) {
arg.bin_area = 0;
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
} else {
for (int c = 0; c < c_block; c++) {
dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c] = 0;
}
}
} else {
int top_y_index = static_cast<int>(floorf(in_y));
int bottom_y_index = static_cast<int>(ceilf(in_y));
int left_x_index = static_cast<int>(floorf(in_x));
int right_x_index = static_cast<int>(ceilf(in_x));
if (right_x_index > jpp.iw - 1)
right_x_index = jpp.iw - 1;
if (bottom_y_index > jpp.ih - 1)
bottom_y_index = jpp.ih - 1;
if (roi_pooling_kernel) {
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
arg.xf = in_x - left_x_index;
arg.yf = in_y - top_y_index;
arg.xoff = (size_t) ((right_x_index - left_x_index) * jpp.c_block * sizeof(float));
arg.yoff = (size_t) ((bottom_y_index - top_y_index) * jpp.iw * jpp.c_block * sizeof(float));
arg.src = &src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
top_y_index * src_strides[2] + left_x_index * src_strides[3]];
arg.bin_area = 1;
} else {
for (int c = 0; c < c_block; c++) {
const float top_left = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
top_y_index * src_strides[2] + left_x_index * src_strides[3] + c];
const float top_right = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
top_y_index * src_strides[2] + right_x_index * src_strides[3] + c];
const float bottom_left = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
bottom_y_index * src_strides[2] + left_x_index * src_strides[3] + c];
const float bottom_right = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
bottom_y_index * src_strides[2] + right_x_index * src_strides[3] + c];
const float top = top_left + (top_right - top_left) * (in_x - left_x_index);
const float bottom = bottom_left + (bottom_right - bottom_left) * (in_x - left_x_index);
dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c] =
top + (bottom - top) * (in_y - top_y_index);
}
}
}
}
if (roi_pooling_kernel) {
(*roi_pooling_kernel)(&arg);
}
}
});
}
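The scalar bilinear branch is a standard two-axis lerp. The same math as a hypothetical standalone helper, for illustration:

    // Lerp along x on the top and bottom rows, then lerp along y between them;
    // xf = in_x - left_x_index, yf = in_y - top_y_index as in the loop above.
    static inline float bilinear(float tl, float tr, float bl, float br,
                                 float xf, float yf) {
        const float top = tl + (tr - tl) * xf;     // x-lerp, top row
        const float bottom = bl + (br - bl) * xf;  // x-lerp, bottom row
        return top + (bottom - top) * yf;          // y-lerp between rows
    }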
bool MKLDNNROIPoolingNode::created() const {
return getType() == ROIPooling;
}
void MKLDNNROIPoolingNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
std::vector<memory::desc> srcs;
srcs.push_back(MKLDNNMemoryDesc(inputDesc[0]));
srcs.push_back(MKLDNNMemoryDesc(inputDesc[1]));
MKLDNNMemoryDesc out_candidate(outputDesc[0]);
MKLDNNDescriptor desc(std::shared_ptr<roi_pooling_forward::desc>(
new roi_pooling_forward::desc(prop_kind::forward_scoring, method, srcs, out_candidate, pooled_h, pooled_w,
spatial_scale)));
descs.push_back(desc);
}
REG_MKLDNN_PRIM_FOR(MKLDNNROIPoolingNode, ROIPooling);

View File

@ -12,22 +12,77 @@
namespace MKLDNNPlugin {
enum ROIPoolingOpType {
Max,
Bilinear
};
struct jit_roi_pooling_params {
int mb, c;
int ih, iw, oh, ow;
int c_block, nb_c, nb_c_blocking;
double spatial_scale;
int pooled_h;
int pooled_w;
ROIPoolingOpType alg;
};
struct jit_roi_pooling_call_args {
const float *src;
float *dst;
size_t kh;
size_t kw;
size_t bin_area;
size_t c_blocks;
float xf;
float yf;
size_t xoff;
size_t yoff;
};
struct jit_uni_roi_pooling_kernel {
void (*ker_)(const jit_roi_pooling_call_args *);
void operator()(const jit_roi_pooling_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_roi_pooling_kernel(jit_roi_pooling_params jpp) : ker_(nullptr), jpp_(jpp) {}
virtual ~jit_uni_roi_pooling_kernel() {}
virtual void create_ker() = 0;
jit_roi_pooling_params jpp_;
};
class MKLDNNROIPoolingNode : public MKLDNNNode {
public:
MKLDNNROIPoolingNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNROIPoolingNode() override = default;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void initSupportedPrimitiveDescriptors() override;
void createPrimitive() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
private:
int pooled_h = 0;
int pooled_w = 0;
float spatial_scale = 0;
mkldnn::algorithm method = mkldnn::algorithm::roi_pooling_max;
ROIPoolingOpType opType = Max;
jit_roi_pooling_params jpp = {};
std::shared_ptr<jit_uni_roi_pooling_kernel> roi_pooling_kernel = nullptr;
};
} // namespace MKLDNNPlugin

View File

@ -3,8 +3,6 @@
//
#include "mkldnn_scatter_update_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
@ -192,13 +190,13 @@ void MKLDNNScatterUpdateNode::initSupportedPrimitiveDescriptors() {
config.inConfs[AXIS_ID].inPlace = -1;
}
auto pushDesc = [&](memory::format inFormat, memory::format idxFormat, memory::format updateFormat, memory::format outFormat) {
auto pushDesc = [&](memory::format_tag inFormat, memory::format_tag idxFormat, memory::format_tag updateFormat, memory::format_tag outFormat) {
config.inConfs[DATA_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(DATA_ID)->getDims(), dataType, inFormat);
config.inConfs[INDICES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(INDICES_ID)->getDims(), indicesType, idxFormat);
config.inConfs[UPDATE_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(UPDATE_ID)->getDims(), dataType, updateFormat);
if (axisRelaxed)
config.inConfs[AXIS_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(AXIS_ID)->getDims(),
MKLDNNExtensionUtils::IEPrecisionToDataType(axisPrec), memory::x);
MKLDNNExtensionUtils::IEPrecisionToDataType(axisPrec), memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), dataType, outFormat);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, outFormat});
};
@ -264,14 +262,10 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) {
auto &indicesMemPtr = getParentEdgeAt(INDICES_ID)->getMemoryPtr();
auto &updateMemPtr = getParentEdgeAt(UPDATE_ID)->getMemoryPtr();
uint8_t *dstPtr = reinterpret_cast<uint8_t*>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dataSize;
uint8_t *srcPtr = reinterpret_cast<uint8_t*>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dataSize;
uint8_t *indicesPtr = reinterpret_cast<uint8_t*>(indicesMemPtr->GetData()) +
indicesMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * indicesSize;
uint8_t *updatePtr = reinterpret_cast<uint8_t*>(updateMemPtr->GetData()) +
updateMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dataSize;
uint8_t *dstPtr = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
uint8_t *srcPtr = reinterpret_cast<uint8_t*>(srcMemPtr->GetPtr());
uint8_t *indicesPtr = reinterpret_cast<uint8_t*>(indicesMemPtr->GetPtr());
uint8_t *updatePtr = reinterpret_cast<uint8_t*>(updateMemPtr->GetPtr());
SizeVector srcDataDim = getParentEdgeAt(DATA_ID)->getDesc().getDims();
SizeVector indicesDim = getParentEdgeAt(INDICES_ID)->getDesc().getDims();
@ -281,7 +275,7 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) {
if (axisRelaxed) {
auto &axisMemPtr = getParentEdgeAt(AXIS_ID)->getMemoryPtr();
uint8_t *axisPtr = reinterpret_cast<uint8_t*>(axisMemPtr->GetData()) +
axisMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * axisSize;
axisMemPtr->GetDescriptor().data.offset0 * axisSize;
if (axisSize == 4) {
auto *axisPtr32 = reinterpret_cast<int32_t*>(axisPtr);
axis = *axisPtr32;
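
The four GetPtr() calls above replace the v0.x idiom of adding layout_desc.blocking.offset_padding to GetData() by hand. A minimal sketch of what such a helper boils down to against the oneDNN v1.x descriptor (illustrative only: base_ptr and elem_size are names local to this sketch, not plugin API):

#include <mkldnn.hpp>
#include <cstdint>

// Illustrative: return a pointer to the first valid element of a memory
// object, applying the descriptor's starting offset. In v1.x that offset is
// data.offset0, replacing v0.x layout_desc.blocking.offset_padding.
static uint8_t* base_ptr(const mkldnn::memory& mem, size_t elem_size) {
    const auto md = mem.get_desc();
    return static_cast<uint8_t*>(mem.get_data_handle())
           + md.data.offset0 * elem_size;
}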

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_softmax_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <string>
#include <mkldnn_types.h>
@ -41,7 +41,7 @@ void MKLDNNSoftMaxNode::getSupportedDescriptors() {
}
if (getParentEdgeAt(0)->getDims().ndims() == 3) {
MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::blocked);
MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::abc);
createDescriptor({in_candidate}, {});
}
@ -73,18 +73,22 @@ void MKLDNNSoftMaxNode::createPrimitive() {
auto prim_desc = softmax_forward::primitive_desc(*selected_desc_ptr, getEngine());
primitive_desc_iterator itpd = descs[0].createPrimitiveDescriptorIterator(getEngine());
while (itpd.is_not_end()) {
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
while (itpd) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
auto primitiveDescriptor = getSelectedPrimitiveDescriptor();
if ((primitiveDescriptor != nullptr) && (impl_type == primitiveDescriptor->getImplementationType())) {
itpd.getPrimitiveDescriptor(prim_desc);
prim_desc = itpd.get();
break;
}
itpd++;
if (!itpd.next_impl())
break;
}
prim.reset(new softmax_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new softmax_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
bool MKLDNNSoftMaxNode::created() const {
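
The softmax hunks above capture the core v1.x execution-model change: a primitive is constructed from its primitive descriptor alone, and memory objects are bound per call through an argument map instead of being baked into the primitive at construction time. A self-contained sketch of the same pattern outside the plugin (engine, stream, and shapes are invented for illustration):

#include <mkldnn.hpp>

using namespace mkldnn;

int main() {
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    // Plain 2D f32 tensor; softmax over axis 1.
    memory::desc md({2, 10}, memory::data_type::f32, memory::format_tag::ab);
    memory src(md, eng), dst(md, eng);

    auto d = softmax_forward::desc(prop_kind::forward_inference, md, /*axis=*/1);
    auto pd = softmax_forward::primitive_desc(d, eng);

    // v1.x: the primitive is built from the descriptor alone...
    softmax_forward prim(pd);
    // ...and memory is supplied at execution time via an argument map,
    // matching the primArgs = {{DNNL_ARG_SRC, ...}} line above.
    prim.execute(strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}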

View File

@ -58,11 +58,6 @@ static TensorDesc makeChannelBlockedTensorDesc(const Precision& precision, const
return TensorDesc(precision, srcDims, {blkDims, order});
}
static inline uint8_t* getDataPtr(const MKLDNNMemory& memoryPtr) {
return reinterpret_cast<uint8_t*>(memoryPtr.GetData()) + memoryPtr.GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(memoryPtr.GetDescriptor().data.data_type));
}
MKLDNNSplitNode::MKLDNNSplitNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(layer, eng, cache) {}
@ -139,7 +134,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].desc = getTensorDesc(precision, srcDims.ToSizeVector());
config.outConfs.resize(outDims.size());
std::vector<memory::format> outFormats;
std::vector<memory::format_tag> outFormats;
for (size_t i = 0; i < outDims.size(); i++) {
auto o_Dims = outDims[i];
@ -197,7 +192,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
const auto& blkDims = refConfig.inConfs[0].desc.getBlockingDesc().getBlockDims();
auto numOfDim = blkDims.size();
std::vector<memory::format> outFormats;
std::vector<memory::format_tag> outFormats;
SizeVector offsets(numOfDim, 0lu);
SizeVector strides(numOfDim);
strides.back() = 1lu;
@ -245,7 +240,7 @@ void MKLDNNSplitNode::execute(mkldnn::stream strm) {
return;
int MB = batchToProcess();
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
size_t batch = this->getParentEdgeAt(0)->getDims()[0];
if (batch != MB)
@ -385,9 +380,6 @@ void MKLDNNSplitNode::setDynamicBatchLim(int lim) {
THROW_ERROR << "Dynamic batch is not supported by split layer with axis == 0 parameter";
dynBatchLim = lim;
if (prim) {
prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size());
}
}
void MKLDNNSplitNode::prepareOptimizedParams() {
@ -418,7 +410,7 @@ void MKLDNNSplitNode::prepareOptimizedParams() {
optimizedParams.dataSize.resize(this->getChildEdges().size());
optimizedParams.dstMemPtrs.clear();
for (int i = 0; i < this->getChildEdges().size(); i++) {
if (uint8_t* dstData = getDataPtr(this->getChildEdgeAt(i)->getMemory())) {
if (uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(i)->getMemoryPtr()->GetPtr())) {
optimizedParams.dstMemPtrs.push_back(dstData);
} else {
THROW_ERROR << "can't get child edge index " << i << " data.";
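
A rename that recurs throughout these hunks: the v0.x memory::format enum becomes memory::format_tag in oneDNN v1.x, and bare enumerators gain the scope qualifier. A side-by-side reminder (comments only, no new API):

// v0.x (Intel MKL-DNN)               ->  v1.x (oneDNN)
// memory::format fmt = memory::nchw; ->  memory::format_tag fmt = memory::format_tag::nchw;
// memory::x                          ->  memory::format_tag::x
// memory::format::blocked            ->  an explicit tag such as memory::format_tag::abc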

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_tensoriterator_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <legacy/ie_layers_internal.hpp>
#include <string>
@ -50,7 +50,8 @@ static InferenceEngine::LayerConfig make_plain_config(const InferenceEngine::CNN
class PortIteratorHelper : public PortMapHelper {
public:
PortIteratorHelper(const MKLDNNMemoryPtr &from, const MKLDNNMemoryPtr &to, bool sliced_src,
const InferenceEngine::TensorIterator::PortMap &slice_rule, const mkldnn::engine& eng) {
const InferenceEngine::TensorIterator::PortMap &slice_rule, const mkldnn::engine& eng)
: sliced_src(sliced_src) {
const auto &full_blob = sliced_src ? from : to;
const auto &part_blob = !sliced_src ? from : to;
@ -71,56 +72,59 @@ public:
// make chunk view
auto chunk_desc = full_blob->GetDescriptor();
chunk_desc.data.dims[axis] = abs_stride;
chunk_desc.data.layout_desc.blocking.padding_dims[axis] = abs_stride; // TODO: assumption that the tensor is plain
chunk_desc.data.padded_dims[axis] = abs_stride; // TODO: assumption that the tensor is plain
mem_holder.push_back(full_blob->GetPrimitive());
auto full_mem_handler = full_blob->GetPrimitive().get_data_handle();
mem_holder.emplace_back(mkldnn::memory::primitive_desc(chunk_desc, eng), full_mem_handler);
auto &chunk_mem_prim = mem_holder.back();
full_mem = full_blob->GetPrimitive();
const auto full_mem_handler = full_mem.get_data_handle();
mkldnn::memory chunk_mem = {chunk_desc, eng, full_mem_handler};
auto elem_size = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(chunk_desc.data.data_type));
chunk_stride_in_byte = chunk_desc.data.layout_desc.blocking.strides[0][axis] * elem_size * abs_stride;
chunk_stride_in_byte = chunk_desc.data.format_desc.blocking.strides[axis] * elem_size * abs_stride;
chunk_offset_in_byte = sign_of_stride < 0 ? (iter_count - 1) * chunk_stride_in_byte : 0;
chunk_stride_in_byte *= sign_of_stride;
if (sliced_src) {
reorders.emplace_back(chunk_mem_prim, to->GetPrimitive());
mem_holder_src = chunk_mem;
mem_holder_dst = to->GetPrimitive();
} else {
reorders.emplace_back(from->GetPrimitive(), chunk_mem_prim);
mem_holder_src = from->GetPrimitive();
mem_holder_dst = chunk_mem;
}
reorder = {mem_holder_src, mem_holder_dst};
}
void execute(mkldnn::stream strm, int iter) override {
IE_ASSERT(iter >= 0 && iter < iter_count);
auto full_mem = mem_holder[FULL_DATA];
auto chunk_mem = mem_holder[CHUNK_DATA];
auto &chunk_mem = sliced_src ? mem_holder_src : mem_holder_dst;
chunk_mem.set_data_handle(static_cast<uint8_t *>(full_mem.get_data_handle()) +
chunk_offset_in_byte + chunk_stride_in_byte * iter);
strm.submit({reorders.begin(), reorders.end()});
reorder.execute(strm, mem_holder_src, mem_holder_dst);
}
private:
ptrdiff_t chunk_stride_in_byte = 0;
ptrdiff_t chunk_offset_in_byte = 0;
const int FULL_DATA = 0;
const int CHUNK_DATA = 1;
bool sliced_src;
mkldnn::memory full_mem;
int iter_count;
};
class BackEdgePortHelper : public PortMapHelper {
public:
BackEdgePortHelper(const MKLDNNMemoryPtr &from, const MKLDNNMemoryPtr &to, const mkldnn::engine& eng) {
reorders.emplace_back(from->GetPrimitive(), to->GetPrimitive());
mem_holder_src = from->GetPrimitive();
mem_holder_dst = to->GetPrimitive();
reorder = {mem_holder_src, mem_holder_dst};
}
void execute(mkldnn::stream strm, int iter) override {
if (iter != 0) {
strm.submit({reorders.begin(), reorders.end()});
reorder.execute(strm, mem_holder_src, mem_holder_dst);
}
}
};
@ -129,13 +133,13 @@ class IterCountPortHelper : public PortMapHelper {
public:
IterCountPortHelper(const MKLDNNMemoryPtr &to, const mkldnn::engine& eng) {
// Only a scalar I32 tensor is supported
IE_ASSERT(to->GetDataType() == memory::s32);
IE_ASSERT(to->GetDataType() == memory::data_type::s32);
IE_ASSERT(to->GetDims() == memory::dims{1});
mem_holder.push_back(to->GetPrimitive());
mem_holder_dst = to->GetPrimitive();
}
void execute(mkldnn::stream strm, int n_iter) override {
auto mem = mem_holder[0];
auto mem = mem_holder_dst;
auto data_ptr = static_cast<uint32_t*>(mem.get_data_handle());
*data_ptr = n_iter;
}
@ -144,14 +148,13 @@ public:
class asBoolCheck : public PortChecker {
public:
asBoolCheck(const MKLDNNMemoryPtr &mem) {
IE_ASSERT(mem->GetDataType() == memory::u8);
IE_ASSERT(mem->GetDataType() == memory::data_type::u8);
IE_ASSERT(mem->GetDims() == memory::dims{1});
mem_holder.push_back(mem->GetPrimitive());
mem_holder = mem->GetPrimitive();
}
int getStatus() override {
auto mem = mem_holder[0];
auto data_ptr = static_cast<uint8_t*>(mem.get_data_handle());
auto data_ptr = static_cast<uint8_t*>(mem_holder.get_data_handle());
return *data_ptr == static_cast<uint8_t>(0) ? 0 : 1;
}
};
@ -159,14 +162,13 @@ public:
class asIntCheck : public PortChecker {
public:
asIntCheck(const MKLDNNMemoryPtr &mem) {
IE_ASSERT(mem->GetDataType() == memory::s32);
IE_ASSERT(mem->GetDataType() == memory::data_type::s32);
IE_ASSERT(mem->GetDims() == memory::dims{1});
mem_holder.push_back(mem->GetPrimitive());
mem_holder = mem->GetPrimitive();
}
int getStatus() override {
auto mem = mem_holder[0];
auto data_ptr = static_cast<uint32_t*>(mem.get_data_handle());
auto data_ptr = static_cast<uint32_t*>(mem_holder.get_data_handle());
return *data_ptr;
}
};
@ -185,7 +187,8 @@ private:
} // namespace MKLDNNPlugin
MKLDNNTensorIteratorNode::MKLDNNTensorIteratorNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(layer, eng, cache) {}
MKLDNNNode(layer, eng, cache),
sub_graph(eng) {}
void MKLDNNTensorIteratorNode::getSupportedDescriptors() {
auto *ti = dynamic_cast<class InferenceEngine::TensorIterator*>(getCnnLayer().get());
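
PortIteratorHelper and BackEdgePortHelper above switch from queueing reorders into a stream (v0.x strm.submit) to executing a single mkldnn::reorder directly against its source and destination. A standalone sketch of the v1.x call (dims and formats are arbitrary):

#include <mkldnn.hpp>

using namespace mkldnn;

int main() {
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    memory::dims dims = {1, 8, 4, 4};
    memory::desc src_md(dims, memory::data_type::f32, memory::format_tag::nchw);
    memory::desc dst_md(dims, memory::data_type::f32, memory::format_tag::nChw8c);
    memory src(src_md, eng), dst(dst_md, eng);

    // v1.x: a reorder is a primitive like any other; the convenience
    // execute(stream, src, dst) overload is what the helpers now call.
    reorder r(src, dst);
    r.execute(strm, src, dst);
    strm.wait();
    return 0;
}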

View File

@ -23,8 +23,9 @@ public:
virtual ~PortMapHelper() = default;
virtual void execute(mkldnn::stream strm, int n_iter = -1) = 0;
protected:
std::vector<mkldnn::reorder> reorders;
std::vector<mkldnn::memory> mem_holder;
mkldnn::reorder reorder;
mkldnn::memory mem_holder_src;
mkldnn::memory mem_holder_dst;
};
@ -38,7 +39,7 @@ public:
virtual ~PortChecker() = default;
virtual int getStatus() = 0;
protected:
std::vector<mkldnn::memory> mem_holder;
mkldnn::memory mem_holder;
};

View File

@ -45,7 +45,7 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() {
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
auto& inDims = getParentEdgeAt(0)->getDims();
memory::format fmt = MKLDNNMemory::GetPlainFormat(inDims);
memory::format_tag fmt = MKLDNNMemory::GetPlainFormat(inDims);
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
@ -76,10 +76,8 @@ void MKLDNNTileNode::createPrimitive() {
void MKLDNNTileNode::execute(mkldnn::stream strm) {
auto& srcMemory = getParentEdgeAt(0)->getMemory();
const float *src_ptr = reinterpret_cast<const float*>(srcMemory.GetData()) +
srcMemory.GetDescriptor().data.layout_desc.blocking.offset_padding;
float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
const float *src_ptr = reinterpret_cast<const float*>(srcMemory.GetPtr());
float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetPtr());
int m_inner_dim = 1;
int m_outer_dim = 1;
@ -94,16 +92,13 @@ void MKLDNNTileNode::execute(mkldnn::stream strm) {
m_inner_dim *= batchToProcess();
}
if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw8c) ||
(inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw8c))) {
if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && srcMemory.GetDesc().isBlockedCFormat(8)) {
/*
* We may enable tile processing directly to the appropriate output format (nChw8c)
*/
m_inner_dim *= 8;
m_outer_dim /= 8;
} else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 &&
((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw16c) ||
(inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw16c))) {
} else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 && srcMemory.GetDesc().isBlockedCFormat(16)) {
/*
* We may enable tile processing directly to the appropriate output format (nChw16c)
*/

View File

@ -3,6 +3,7 @@
//
#include "base.hpp"
#include "utils/general_utils.h"
#include "common/defs.h"
#include "common/softmax.h"
#include "common/cpu_convert.h"
@ -13,13 +14,15 @@
#include <mkldnn_extension_utils.h>
#include "utils/bfloat16.hpp"
#include "common/cpu_memcpy.h"
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "mkldnn.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
namespace InferenceEngine {
@ -46,6 +49,8 @@ struct jit_uni_logistic_kernel {
void operator()(const jit_args_logistic *args) { assert(ker_); ker_(args); }
virtual void create_ker() = 0;
jit_uni_logistic_kernel() : ker_(nullptr) {}
virtual ~jit_uni_logistic_kernel() {}
};
@ -54,8 +59,15 @@ template <cpu_isa_t isa>
struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_logistic_kernel_f32)
jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jit_uni_logistic_kernel(), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jcp_(jcp), jit_uni_logistic_kernel(), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, mkldnn::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
@ -76,12 +88,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
cmp(reg_work_amount, step);
jl(tail_loop_label, T_NEAR);
load_vector(vmm_src, ptr[reg_src], jcp.src_dt);
load_vector(vmm_src, ptr[reg_src], jcp_.src_dt);
compute_kernel();
store_vector(ptr[reg_dst], vmm_src, jcp.dst_dt);
store_vector(ptr[reg_dst], vmm_src, jcp_.dst_dt);
add(reg_src, step * jcp.src_data_size);
add(reg_dst, step * jcp.dst_data_size);
add(reg_src, step * jcp_.src_data_size);
add(reg_dst, step * jcp_.dst_data_size);
sub(reg_work_amount, step);
jmp(main_loop_label, T_NEAR);
@ -92,12 +104,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
cmp(reg_work_amount, step);
jl(exit_label, T_NEAR);
load_scalar(xmm_src, ptr[reg_src], jcp.src_dt);
load_scalar(xmm_src, ptr[reg_src], jcp_.src_dt);
compute_kernel();
store_scalar(ptr[reg_dst], xmm_src, jcp.dst_dt);
store_scalar(ptr[reg_dst], xmm_src, jcp_.dst_dt);
add(reg_src, step * jcp.src_data_size);
add(reg_dst, step * jcp.dst_data_size);
add(reg_src, step * jcp_.src_data_size);
add(reg_dst, step * jcp_.dst_data_size);
sub(reg_work_amount, step);
jmp(tail_loop_label, T_NEAR);
@ -113,12 +125,10 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
exp_injector->prepare_table();
prepare_table();
ker_ = (decltype(ker_))this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == x64::sse41, Xbyak::Xmm, isa == x64::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Address table_val(int index) { return ptr[reg_table + index * vlen]; }
@ -143,6 +153,8 @@ private:
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
jit_logistic_config_params jcp_;
void compute_kernel() {
uni_vmovups(vmm_aux0, vmm_src);
uni_vandps(vmm_aux0, vmm_aux0, table_val(0));
@ -157,10 +169,10 @@ private:
uni_vmovups(vmm_aux2, table_val(1));
uni_vsubps(vmm_aux2, vmm_aux2, vmm_src);
if (isa == cpu::sse42) {
if (isa == x64::sse41) {
uni_vblendvps(vmm_aux2, vmm_aux2, vmm_src, vmm_aux0);
uni_vmovups(vmm_src, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == x64::avx2) {
uni_vblendvps(vmm_src, vmm_aux2, vmm_src, vmm_aux0);
} else {
vptestmd(k_mask, vmm_aux0, vmm_aux0);
@ -281,19 +293,22 @@ public:
jcp.src_data_size = jcp.dst_data_size = output_prec.size();
block_size = 1;
if (mayiuse(cpu::avx512_common)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx512_common>(jcp));
if (mayiuse(x64::avx512_common)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<x64::avx512_common>(jcp));
block_size = 16;
} else if (mayiuse(cpu::avx2)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx2>(jcp));
} else if (mayiuse(x64::avx2)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<x64::avx2>(jcp));
block_size = 8;
} else if (mayiuse(cpu::sse42)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::sse42>(jcp));
} else if (mayiuse(x64::sse41)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<x64::sse41>(jcp));
block_size = 4;
}
softmax_kernel = std::make_shared<SoftmaxGeneric>(input_prec, output_prec);
if (logistic_kernel)
logistic_kernel->create_ker();
addConfig(layer, {DataConfigurator(ConfLayout::PLN, input_prec)}, {DataConfigurator(ConfLayout::PLN, output_prec)});
} catch (InferenceEngine::details::InferenceEngineException &ex) {
errorMsg = ex.what();
@ -413,7 +428,7 @@ private:
float_dst_data[i + start_index] = logistic_scalar(float_dst_data[i + start_index]);
}
} else if (Precision::BF16 == output_prec) {
auto bf16_dst_data = reinterpret_cast<bfloat16_t*>(dst_data);
auto bf16_dst_data = reinterpret_cast<MKLDNNPlugin::bfloat16_t*>(dst_data);
for (int i = 0; i < count; i++) {
bf16_dst_data[i + start_index] = logistic_scalar(bf16_dst_data[i + start_index]);
}
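
The logistic-kernel hunks above show oneDNN v1.6's two-phase JIT pattern: instruction emission moves from the constructor into a generate() override, and the host calls create_ker() once before taking the entry point. A stripped-down shape of that pattern (it relies on oneDNN internal headers from the plugin's build tree, so treat it as a sketch rather than a standalone program):

#include <cpu/x64/jit_generator.hpp>

struct my_kernel : public mkldnn::impl::cpu::x64::jit_generator {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(my_kernel)

    void (*ker_)(const void*) = nullptr;

    // Phase 2: emit the code and take the entry point; this mirrors
    // create_ker() in jit_uni_logistic_kernel_f32 above.
    void create_ker() {
        jit_generator::create_kernel();   // runs generate() and finalizes the code buffer
        ker_ = (decltype(ker_))jit_ker();
    }

    // Phase 1 body: emission now lives here, not in the constructor.
    void generate() override {
        ret();   // a real kernel emits its loops and injector tables first
    }
};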

View File

@ -6,7 +6,6 @@
#include <cmath>
#include <limits>
#include "utils.hpp"
#include "nodes/common/emitter.h"
/**
@ -77,7 +76,7 @@ private:
class jit_emu_vcvtneps2bf16 : public jit_emitter {
public:
jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
};
@ -87,7 +86,7 @@ public:
private:
void emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs,
const std::vector<size_t>& pool_vec_idxs, const std::vector<size_t>& pool_gpr_idxs) {
if (host_isa_ == mkldnn::impl::cpu::cpu_isa_t::avx512_common) {
if (host_isa_ == mkldnn::impl::cpu::x64::cpu_isa_t::avx512_common) {
Xbyak::Zmm in = Xbyak::Zmm(in_vec_idxs[0]);
Xbyak::Ymm out = Xbyak::Ymm(out_vec_idxs[0]);
Xbyak::Zmm aux = Xbyak::Zmm(aux_vec_idxs[0]);

View File

@ -6,8 +6,7 @@
#include "blob_factory.hpp"
#include "mkldnn_memory.h"
// It's so bad to include by relative path :-(
#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
#include "common/memory_desc_wrapper.hpp"
#include <fstream>

View File

@ -0,0 +1,43 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <cassert>
namespace MKLDNNPlugin {
template<typename T, typename U>
inline T div_up(const T a, const U b) {
assert(b);
return (a + b - 1) / b;
}
template<typename T, typename U>
inline T rnd_up(const T a, const U b) {
return div_up(a, b) * b;
}
template <typename T, typename P>
constexpr bool one_of(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
constexpr bool one_of(T val, P item, Args... item_others) {
return val == item || one_of(val, item_others...);
}
template <typename T, typename P>
constexpr bool everyone_is(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
constexpr bool everyone_is(T val, P item, Args... item_others) {
return val == item && everyone_is(val, item_others...);
}
constexpr inline bool implication(bool cause, bool cond) {
return !cause || !!cond;
}
} // namespace MKLDNNPlugin
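
For reference, the semantics of the new helpers in a few concrete checks (the values are arbitrary):

#include <cassert>
#include "utils/general_utils.h"   // the header added above

int main() {
    using namespace MKLDNNPlugin;
    assert(div_up(10, 4) == 3);         // ceiling division: (10 + 4 - 1) / 4
    assert(rnd_up(10, 4) == 12);        // round up to a multiple of 4
    assert(one_of(2, 1, 2, 3));         // true: 2 appears among the trailing items
    assert(everyone_is(5, 5, 5));       // true: every trailing item equals 5
    assert(implication(true, true));    // !cause || cond
    assert(implication(false, false));  // vacuously true when the cause is false
    return 0;
}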

View File

@ -373,6 +373,8 @@ bool fuse_type_to_constant(std::shared_ptr<Node> & node, element::Type to, const
new_const = change_constant_precision<element::Type_t::u8, element::Type_t::i32>(constant);
} else if (from == element::u16 && to == element::i32) {
new_const = change_constant_precision<element::Type_t::u16, element::Type_t::i32>(constant);
} else if (from == element::i16 && to == element::i32) {
new_const = change_constant_precision<element::Type_t::i16, element::Type_t::i32>(constant);
} else if (from == element::u32 && to == element::i32) {
new_const = change_constant_precision<element::Type_t::u32, element::Type_t::i32>(constant);
} else if (from == element::f16 && to == element::f32) {

View File

@ -100,7 +100,7 @@ protected:
// performance counters
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["CONV_2"] = "FP32";
expectedPrecisions["CONV_2"] = "BF16";
}
};

View File

@ -164,7 +164,7 @@ protected:
// performance counters
expectedPrecisions["Convolution_1"] = "FP32";
expectedPrecisions["ReLU_1"] = "ndef";
expectedPrecisions["AvgPool_1"] = "BF16";
expectedPrecisions["AvgPool_1"] = netPrecision == Precision::BF16 ? "BF16" : "FP32";
expectedPrecisions["Convolution_2"] = "BF16";
expectedPrecisions["ReLU_2"] = "ndef";
expectedPrecisions["MaxPool_2"] = "BF16";

View File

@ -16,7 +16,9 @@ const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP16
};
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}};
const std::vector<std::vector<size_t>> inputShapes = {{1, 1}, {2, 6}, {1, 1, 1}, {2, 6, 13},
{1, 1, 1, 1}, {3, 10, 5, 6}, {2, 8, 5, 18}, {2, 16, 3, 18}, {3, 49, 5, 6},
{1, 1, 1, 1, 1}, {3, 10, 2, 5, 6}, {2, 8, 1, 5, 18}, {2, 16, 4, 3, 18}, {3, 49, 7, 5, 6}};
const std::vector<std::vector<size_t>> constShapes = {{1}};
const std::vector<size_t> levels = {16, 255, 256};
@ -24,7 +26,6 @@ const std::pair<std::string, std::map<std::string, std::string>> config = {};
const std::vector<float> fqArgs = {};
const std::vector<float> inputParams = {};
const auto fqParams = ::testing::Combine(
::testing::ValuesIn(levels),
::testing::ValuesIn(constShapes),
@ -45,4 +46,47 @@ INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize, FakeQuantizeLayerTest,
::testing::Values(config)),
FakeQuantizeLayerTest::getTestCaseName);
const std::vector<std::vector<size_t>> inputShapesPerChannel = {{11, 10, 22, 19}, {11, 10, 5, 6}};
const std::vector<std::vector<size_t>> constShapesPerChannelAxis0 = {{11, 1, 1, 1}};
const std::vector<std::vector<size_t>> constShapesPerChannelAxis1 = {{1, 10, 1, 1}};
const auto fqParamsPerChannelAxis0 = ::testing::Combine(
::testing::ValuesIn(levels),
::testing::ValuesIn(constShapesPerChannelAxis0),
::testing::Values(fqArgs),
::testing::Values(inputParams)
);
const auto fqParamsPerChannelAxis1 = ::testing::Combine(
::testing::ValuesIn(levels),
::testing::ValuesIn(constShapesPerChannelAxis1),
::testing::Values(fqArgs),
::testing::Values(inputParams)
);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizePerChannelAxis0, FakeQuantizeLayerTest,
::testing::Combine(
fqParamsPerChannelAxis0,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapesPerChannel),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(config)),
FakeQuantizeLayerTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizePerChannelAxis1, FakeQuantizeLayerTest,
::testing::Combine(
fqParamsPerChannelAxis1,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapesPerChannel),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(config)),
FakeQuantizeLayerTest::getTestCaseName);
} // namespace

Some files were not shown because too many files have changed in this diff.