[CPU] Split layer nspc -> ncsp special case put back. (#3839)

Author: Maksim Kutakov, 2021-02-02 15:40:50 +03:00 (committed by GitHub)
parent cc000e57e0
commit f0398212f8
4 changed files with 216 additions and 41 deletions

File 1 of 4: mkldnn_split_node.cpp

@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -6,6 +6,7 @@
#include "common/cpu_memcpy.h"
#include <legacy/ie_layers.h>
#include <vector>
#include <queue>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <climits>
@@ -80,7 +81,7 @@ void MKLDNNSplitNode::getSupportedDescriptors() {
void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
using TensorDescFactory = std::function<TensorDesc(const Precision&, const SizeVector&)>;
constexpr size_t channelsPos = 1lu;
// perform guard checks
if (!supportedPrimitiveDescriptors.empty())
return;
@@ -218,6 +219,16 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFormats);
}
// Special nspc -> ncsp case when splitting channels
if (axis == 1 && (dstFirstDims.ndims() == 4 || dstFirstDims.ndims() == 5)) {
auto plain = makePdInfo(&makePlainTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
auto perChannel = makePdInfo(&makePerChannelTensorDesc, inpPrecision, srcDims, outDims, impl_desc_type::ref);
plain.getConfig().inConfs[0].desc = perChannel.getConfig().inConfs[0].desc;
supportedPrimitiveDescriptors.push_back(plain);
}
}
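For context on the special case just added: with an nspc (NHWC/NDHWC) input, the channel values of one spatial point are contiguous, while an ncsp (NCHW/NCDHW) output stores each channel as a whole plane, so a single strided copy cannot serve both layouts. The stride difference is easiest to see in a minimal standalone sketch (plain C++; the shape matches the 4D test case below, but the names are illustrative, not the plugin API):

#include <array>
#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative {N, C, H, W} = {3, 28, 24, 9} tensor (the 4D test shape used below)
    const size_t C = 28, H = 24, W = 9;
    // ncsp (NCHW): moving one channel jumps a whole H*W plane
    const std::array<size_t, 4> ncspStrides = {C * H * W, H * W, W, 1};
    // nspc (NHWC): channels are innermost (stride 1); one W step jumps C elements
    const std::array<size_t, 4> nspcStrides = {H * W * C, 1, W * C, C};
    // Linear element offset of (n, c, h, w) under each layout
    auto offset = [](const std::array<size_t, 4>& s, size_t n, size_t c, size_t h, size_t w) {
        return n * s[0] + c * s[1] + h * s[2] + w * s[3];
    };
    std::printf("ncsp: %zu, nspc: %zu\n",
                offset(ncspStrides, 1, 2, 3, 4), offset(nspcStrides, 1, 2, 3, 4));
    return 0;
}

Giving the plain configuration a per-channel input descriptor is what lets the graph feed this node an nspc tensor while the consumers still receive plain ncsp outputs.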
void MKLDNNSplitNode::createPrimitive() {
@@ -231,23 +242,49 @@
if (getSelectedPrimitiveDescriptor() == nullptr)
THROW_ERROR << "Preferable primitive descriptor is not set.";
-if (!isOptimized())
-prepareOptimizedParams();
+canUseOptimizedNspc2Ncsp = true;
+if (axis != 1)
+canUseOptimizedNspc2Ncsp = false;
+if (getParentEdgeAt(0)->getBlob()->getTensorDesc().getLayout() != NHWC &&
+getParentEdgeAt(0)->getBlob()->getTensorDesc().getLayout() != NDHWC)
+canUseOptimizedNspc2Ncsp = false;
+for (size_t i = 0; i < getChildEdges().size(); i++) {
+if (getChildEdgeAt(i)->getBlob()->getTensorDesc().getLayout() != NCHW &&
+getChildEdgeAt(i)->getBlob()->getTensorDesc().getLayout() != NCDHW)
+canUseOptimizedNspc2Ncsp = false;
+}
+if (!isOptimized()) {
+initializeDstMemPtrs();
+if (!canUseOptimizedNspc2Ncsp)
+prepareOptimizedParams();
+}
}
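The gating added here can be read as a single predicate: the fast path applies only when the channel axis is split, the input is nspc, and every output is ncsp. A self-contained restatement (an illustrative sketch with its own Layout enum, not the InferenceEngine type):

#include <cstddef>
#include <vector>

enum class Layout { NCHW, NHWC, NCDHW, NDHWC };  // illustrative stand-in

// Sketch of the gating logic: the fast nspc -> ncsp kernel applies only when
// splitting the channel axis of an interleaved input into planar outputs.
bool canUseNspc2Ncsp(size_t axis, Layout in, const std::vector<Layout>& outs) {
    if (axis != 1)  // only channel splits qualify
        return false;
    if (in != Layout::NHWC && in != Layout::NDHWC)  // input must be nspc
        return false;
    for (auto l : outs)  // every output must be ncsp
        if (l != Layout::NCHW && l != Layout::NCDHW)
            return false;
    return true;
}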
void MKLDNNSplitNode::execute(mkldnn::stream strm) {
if (isOptimized())
return;
+if (dstMemPtrs.empty())
+THROW_ERROR << "Output data pointers have not been initialized.";
int MB = batchToProcess();
+if (canUseOptimizedNspc2Ncsp) {
+optimizedNspc2Ncsp(MB);
+return;
+}
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
size_t batch = this->getParentEdgeAt(0)->getDims()[0];
if (batch != MB)
optimizedParams.countStrides = optimizedParams.countStrides / batch * MB;
-parallel_for2d(this->getChildEdges().size(), optimizedParams.countStrides, [&](size_t i, size_t j) {
-uint8_t* dstData = optimizedParams.dstMemPtrs[i];
+parallel_for2d(dstMemPtrs.size(), optimizedParams.countStrides, [&](size_t i, size_t j) {
+uint8_t* dstData = dstMemPtrs[i];
cpu_memcpy(&dstData[j * optimizedParams.dataSize[i]],
&srcData[optimizedParams.srcDataOffsets[i] + j * optimizedParams.srcDataStride],
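For reference, the generic optimized path above copies, for each output i, countStrides contiguous chunks of dataSize[i] bytes, reading at offset srcDataOffsets[i] within consecutive source rows of srcDataStride bytes. A serial sketch of that pattern (plain C++; the plugin runs the two loops under parallel_for2d with cpu_memcpy):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Serial sketch of the generic split copy (illustrative, not the plugin code).
void splitCopy(const uint8_t* src,
               const std::vector<uint8_t*>& dst,      // one data pointer per output
               const std::vector<size_t>& dataSize,   // bytes per output per row
               const std::vector<size_t>& srcOffsets, // byte offset of each output in a row
               size_t srcStride,                      // total bytes per source row
               size_t countStrides) {                 // number of rows
    for (size_t i = 0; i < dst.size(); ++i)
        for (size_t j = 0; j < countStrides; ++j)
            std::memcpy(dst[i] + j * dataSize[i],
                        src + srcOffsets[i] + j * srcStride,
                        dataSize[i]);
}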
@@ -346,7 +383,7 @@ void MKLDNNSplitNode::selectOptimalPrimitiveDescriptor() {
inNum = 0;
}
if (MKLDNNExtensionUtils::initTensorsAreEqual(
-getSupportedPrimitiveDescriptors()[i].getConfig().inConfs[0].desc,
+supportedPrimitiveDescriptors[i].getConfig().inConfs[0].desc,
parent_spd->getConfig().outConfs[inNum].desc)) {
canSelectPrimitive.push_back(i);
}
@@ -364,6 +401,46 @@ void MKLDNNSplitNode::selectOptimalPrimitiveDescriptor() {
}
}
// if there is no inPlace candidate but more than one suitable configuration, select the one that matches the output layout
for (auto indx : canSelectPrimitive) {
bool outputDescFullMatch = true;
for (size_t i = 0; i < getChildEdges().size(); ++i) {
auto childEdge = getChildEdgeAt(i);
auto childPtr = childEdge->getChild();
auto& vecChildSpd = childPtr->getSupportedPrimitiveDescriptors();
const auto& outputDesc = supportedPrimitiveDescriptors[indx].getConfig().outConfs[i].desc;
if (!vecChildSpd.empty()) {
int inNum = childEdge->getOutputNum();
if (inNum < 0) {
inNum = 0;
}
bool hasMatchDesc = false;
for (auto& childSpd : vecChildSpd) {
if (inNum >= childSpd.getConfig().inConfs.size()) {
inNum = 0;
}
if (MKLDNNExtensionUtils::initTensorsAreEqual(outputDesc, childSpd.getConfig().inConfs[inNum].desc)) {
hasMatchDesc = true;
break;
}
}
if (!hasMatchDesc) {
outputDescFullMatch = false;
break;
}
}
}
if (outputDescFullMatch) {
selectPrimitiveDescriptorByIndex(static_cast<int>(indx));
return;
}
}
if (!canSelectPrimitive.empty()) {
selectPrimitiveDescriptorByIndex(static_cast<int>(canSelectPrimitive.front()));
return;
}
// if there are no matching data layouts, select the first optimized implementation
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
if (supportedPrimitiveDescriptors[i].getImplementationType() == impl_desc_type::unknown) {
@@ -384,50 +461,119 @@ void MKLDNNSplitNode::setDynamicBatchLim(int lim) {
void MKLDNNSplitNode::prepareOptimizedParams() {
const auto& inpTensorDesc = this->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].desc;
const auto outputPortsCount = outDims.size();
// find axis order position
const auto& order = inpTensorDesc.getBlockingDesc().getOrder();
-unsigned axisOrderPos = UINT_MAX;
+unsigned axisOrderPos = std::numeric_limits<unsigned>::max();
for (size_t i = 0; i < order.size(); ++i) {
if (order[i] == axis) {
axisOrderPos = i;
break;
}
}
-if (UINT_MAX == axisOrderPos) {
+if (std::numeric_limits<unsigned>::max() == axisOrderPos) {
THROW_ERROR << "Can't find the axis in the input tensor order list";
}
uint8_t srcDataSize = inpTensorDesc.getPrecision().size();
const auto& srcDims = inpTensorDesc.getBlockingDesc().getBlockDims();
-int nDims = srcDims.size();
+const auto nDims = srcDims.size();
optimizedParams.countStrides = 1;
for (int i = 0; i < axisOrderPos; i++)
optimizedParams.countStrides *= srcDims[i];
optimizedParams.srcDataStride = 0;
-optimizedParams.dataSize.resize(this->getChildEdges().size());
-optimizedParams.dstMemPtrs.clear();
-for (int i = 0; i < this->getChildEdges().size(); i++) {
-if (uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(i)->getMemoryPtr()->GetPtr())) {
-optimizedParams.dstMemPtrs.push_back(dstData);
-} else {
-THROW_ERROR << "can't get child edge indx " << i << "data.";
-}
+optimizedParams.dataSize.resize(outputPortsCount);
+for (size_t i = 0; i < outputPortsCount; i++) {
+auto outputEdge = this->getChildEdgesAtPort(i).front();
optimizedParams.dataSize[i] = srcDataSize;
-for (int j = axisOrderPos; j < nDims; j++)
-optimizedParams.dataSize[i] *= this->getChildEdgeAt(i)->getDesc().getBlockingDesc().getBlockDims()[j];
+for (size_t j = axisOrderPos; j < nDims; j++)
+optimizedParams.dataSize[i] *= outputEdge->getDesc().getBlockingDesc().getBlockDims()[j];
optimizedParams.srcDataStride += optimizedParams.dataSize[i];
}
-optimizedParams.srcDataOffsets.resize(this->getChildEdges().size());
+optimizedParams.srcDataOffsets.resize(outputPortsCount);
optimizedParams.srcDataOffsets[0] = 0;
-for (int i = 1; i < this->getChildEdges().size(); i++) {
+for (size_t i = 1; i < outputPortsCount; i++) {
optimizedParams.srcDataOffsets[i] = optimizedParams.srcDataOffsets[i - 1] + optimizedParams.dataSize[i - 1];
}
}
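A worked example of these quantities, using the fp32 {1, 20, 2, 5} -> {1, 13, 2, 5} + {1, 7, 2, 5} case from the unit tests below (a standalone sketch, not the plugin code):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // fp32 NCHW input {1, 20, 2, 5} split along axis 1 into
    // {1, 13, 2, 5} and {1, 7, 2, 5} (one of the unit-test cases below)
    const size_t srcDataSize = 4;   // sizeof(float)
    const size_t axisOrderPos = 1;  // position of the split axis in plain order
    const std::vector<std::vector<size_t>> outDims = {{1, 13, 2, 5}, {1, 7, 2, 5}};

    const size_t countStrides = 1;  // product of dims before the axis: N = 1
    std::vector<size_t> dataSize(outDims.size());
    std::vector<size_t> srcDataOffsets(outDims.size(), 0);
    size_t srcDataStride = 0;
    for (size_t i = 0; i < outDims.size(); ++i) {
        dataSize[i] = srcDataSize;
        for (size_t j = axisOrderPos; j < outDims[i].size(); ++j)
            dataSize[i] *= outDims[i][j];  // bytes each output consumes per row
        srcDataStride += dataSize[i];
        if (i > 0)
            srcDataOffsets[i] = srcDataOffsets[i - 1] + dataSize[i - 1];
    }
    // Expected: dataSize = {520, 280}, srcDataStride = 800, offsets = {0, 520}
    std::printf("countStrides=%zu stride=%zu offsets={%zu, %zu}\n",
                countStrides, srcDataStride, srcDataOffsets[0], srcDataOffsets[1]);
    return 0;
}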
void MKLDNNSplitNode::optimizedNspc2Ncsp(size_t MB) {
auto parentEdge = getParentEdgeAt(0);
const int ndims = parentEdge->getDims().ndims();
const size_t IC = parentEdge->getDims()[1];
const size_t D = ndims == 5 ? parentEdge->getDims()[ndims - 3] : 1;
const size_t H = parentEdge->getDims()[ndims - 2];
const size_t W = parentEdge->getDims()[ndims - 1];
auto srcBlob = parentEdge->getBlob();
auto srcData = srcBlob->cbuffer().as<const uint8_t*>();
const auto dataSize = srcBlob->getTensorDesc().getPrecision().size();
const size_t DHW = D*H*W;
const size_t strideIB = DHW * IC * dataSize;
const size_t strideIW = IC*dataSize;
const size_t strideOC = DHW * dataSize;
for (size_t i = 0, sIdx = 0; i < outDims.size(); i++) {
auto dstData = dstMemPtrs[i];
size_t innerSize = 1;
auto dims = outDims[i].ToSizeVector();
for (size_t j = axis; j < dims.size(); j++) {
innerSize *= dims[j];
}
auto srcPtr = srcData + srcBlob->getTensorDesc().offset(sIdx) * dataSize;
const size_t OC = dims[1];
const size_t strideOB = OC * strideOC;
parallel_for2d(MB, DHW, [&](size_t b, size_t j) {
auto localSrcPtr = srcPtr + b*strideIB + j*strideIW;
auto localDstPtr = dstData + b*strideOB + j*dataSize;
for (size_t c = 0; c < OC; c++) {
cpu_memcpy(localDstPtr, localSrcPtr, dataSize);
localSrcPtr += dataSize;
localDstPtr += strideOC;
}
});
sIdx += innerSize;
}
}
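The same loop structure, reduced to a single output for clarity (a serial, illustrative sketch; the plugin walks all outputs via sIdx and parallelizes over batch and spatial points):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Serial sketch of the nspc -> ncsp copy for one output: for each batch item
// and spatial point, the OC channel values are contiguous in the source and
// scattered one channel plane (DHW elements) apart in the destination.
void nspc2NcspOne(const uint8_t* src, uint8_t* dst,
                  size_t MB, size_t IC, size_t DHW,
                  size_t channelOffset,  // first source channel of this output
                  size_t OC,             // channels in this output
                  size_t dataSize) {     // bytes per element
    const size_t strideIB = DHW * IC * dataSize;  // source batch stride
    const size_t strideIW = IC * dataSize;        // source stride per spatial point
    const size_t strideOC = DHW * dataSize;       // dest stride per channel plane
    const size_t strideOB = OC * strideOC;        // dest batch stride
    for (size_t b = 0; b < MB; ++b) {
        for (size_t j = 0; j < DHW; ++j) {
            const uint8_t* s = src + b * strideIB + j * strideIW + channelOffset * dataSize;
            uint8_t* d = dst + b * strideOB + j * dataSize;
            for (size_t c = 0; c < OC; ++c) {
                std::memcpy(d, s, dataSize);  // scatter one element per channel
                s += dataSize;
                d += strideOC;
            }
        }
    }
}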
void MKLDNNSplitNode::initializeDstMemPtrs() {
dstMemPtrs.clear();
// Here we have to place the output data pointers in the order that reflects the output edges order.
// This is important when several edges are connected to one port.
// This is a naive implementation; an indexed priority queue or a modified treap would be a more elegant solution.
std::unordered_map<uint8_t*, size_t> mapDstPtrs;
using pair_t = std::pair<uint8_t*, size_t>;
for (size_t i = 0; i < getChildEdges().size(); ++i) {
auto outputEdge = this->getChildEdgeAt(i);
if (uint8_t* dstData = reinterpret_cast<uint8_t*>(outputEdge->getMemoryPtr()->GetPtr())) {
mapDstPtrs[dstData] = i;
} else {
THROW_ERROR << "can't get child edge index " << i << " data.";
}
}
std::vector<uint8_t*> vecCountingSort(getChildEdges().size(), nullptr);
for (auto& item : mapDstPtrs) {
vecCountingSort[item.second] = item.first;
}
dstMemPtrs.reserve(vecCountingSort.size());
auto backInserter = std::back_inserter(dstMemPtrs);
std::copy_if(vecCountingSort.begin(), vecCountingSort.end(), backInserter, [](const uint8_t* x) {return x;});
dstMemPtrs.shrink_to_fit();
}
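The ordering trick above, extracted into a standalone sketch (illustrative names): map each distinct pointer to the last edge index that refers to it, scatter by index, then compact. Edges connected to the same port share one pointer, so duplicates collapse naturally:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <unordered_map>
#include <vector>

// Keep one entry per distinct destination pointer, ordered by the last edge
// index that uses it (the "counting sort" mentioned in the comment above).
std::vector<uint8_t*> orderUniquePtrs(const std::vector<uint8_t*>& edgePtrs) {
    std::unordered_map<uint8_t*, size_t> lastIndex;
    for (size_t i = 0; i < edgePtrs.size(); ++i)
        lastIndex[edgePtrs[i]] = i;        // later edges overwrite earlier ones

    std::vector<uint8_t*> slots(edgePtrs.size(), nullptr);
    for (const auto& item : lastIndex)
        slots[item.second] = item.first;   // scatter by edge index

    std::vector<uint8_t*> result;
    std::copy_if(slots.begin(), slots.end(), std::back_inserter(result),
                 [](const uint8_t* p) { return p != nullptr; });
    return result;
}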
REG_MKLDNN_PRIM_FOR(MKLDNNSplitNode, Split);

File 2 of 4: mkldnn_split_node.h

@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -29,13 +29,17 @@ public:
private:
void prepareOptimizedParams();
+void initializeDstMemPtrs();
+void optimizedNspc2Ncsp(size_t MB);
+bool canUseOptimizedNspc2Ncsp;
size_t axis = 1;
+std::vector<uint8_t*> dstMemPtrs;
struct {
std::vector<size_t> dataSize;
std::vector<size_t> srcDataOffsets;
-std::vector<uint8_t *> dstMemPtrs;
size_t srcDataStride;
size_t countStrides;
} optimizedParams;

File 3 of 4: CPU single-layer Split test (split.cpp)

@@ -1,4 +1,4 @@
-// Copyright (C) 2020 Intel Corporation
+// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -91,8 +91,11 @@ const auto planar_5D_ref = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref"}, "ref"};
const auto planar_4D = CPUSpecificParams{{nchw}, {nchw}, {}, "unknown"};
const auto planar_5D = CPUSpecificParams{{ncdhw}, {ncdhw}, {}, "unknown"};
-const auto planarChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"};
-const auto planarChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"};
+const auto perChannels_4D = CPUSpecificParams{{nhwc}, {nhwc}, {}, "ref"};
+const auto perChannels_5D = CPUSpecificParams{{ndhwc}, {ndhwc}, {}, "ref"};
+const auto perChannelsToPlanar_4D = CPUSpecificParams{{nhwc}, {nchw}, {}, "ref"};
+const auto perChannelsToPlanar_5D = CPUSpecificParams{{ndhwc}, {ncdhw}, {}, "ref"};
const auto blocked8_4D = CPUSpecificParams{{nChw8c}, {nChw8c}, {}, "unknown"};
const auto blocked8_5D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {}, "unknown"};
@@ -114,6 +117,28 @@ const std::vector<Precision> netPrecisions = {
Precision::BF16
};
INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
::testing::Combine(
::testing::Values(4),
::testing::Values(1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 28, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(perChannelsToPlanar_4D)),
SplitLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Nspc2NcspSpecial, SplitLayerCPUTest,
::testing::Combine(
::testing::Values(3),
::testing::Values(1),
::testing::ValuesIn(netPrecisions),
::testing::Values(std::vector<size_t>({3, 21, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(perChannelsToPlanar_5D)),
SplitLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8inPlace, SplitLayerCPUTest,
::testing::Combine(
::testing::Values(3),
@@ -122,7 +147,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8inPlace, SplitLayerCPUTest,
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::Values(CommonTestUtils::DEVICE_CPU),
-::testing::Values(planar_4D, planar_4D_ref, planarChannels_4D, blocked8_4D)),
+::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D)),
SplitLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8, SplitLayerCPUTest,
@@ -133,7 +158,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block8, SplitLayerCPUTest,
::testing::Values(std::vector<size_t>({3, 24, 24, 9})),
::testing::Values(std::vector<size_t>({})),
::testing::Values(CommonTestUtils::DEVICE_CPU),
-::testing::Values(planar_4D, planar_4D_ref, planarChannels_4D, blocked8_4D_ref)),
+::testing::Values(planar_4D, planar_4D_ref, perChannels_4D, blocked8_4D_ref)),
SplitLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Split4D_CPU_Block16inPlace, SplitLayerCPUTest,
@@ -166,7 +191,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8inPlace, SplitLayerCPUTest,
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::Values(CommonTestUtils::DEVICE_CPU),
-::testing::Values(planar_5D, planar_5D_ref, planarChannels_5D, blocked8_5D)),
+::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D)),
SplitLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8, SplitLayerCPUTest,
@@ -177,7 +202,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block8, SplitLayerCPUTest,
::testing::Values(std::vector<size_t>({3, 24, 24, 9, 15})),
::testing::Values(std::vector<size_t>({})),
::testing::Values(CommonTestUtils::DEVICE_CPU),
-::testing::Values(planar_5D, planar_5D_ref, planarChannels_5D, blocked8_5D_ref)),
+::testing::Values(planar_5D, planar_5D_ref, perChannels_5D, blocked8_5D_ref)),
SplitLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Split5D_CPU_Block16inPlace, SplitLayerCPUTest,

File 4 of 4: MKLDNN graph Split layer unit tests

@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@@ -230,27 +230,27 @@ INSTANTIATE_TEST_CASE_P(
split_test_params {
{1, 24, 2, 5},
{{1, 16, 2, 5}, {1, 8, 2, 5}},
-1, 5, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+1, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
},
split_test_params {
{1, 20, 2, 5},
{{1, 13, 2, 5}, {1, 7, 2, 5}},
-1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+1, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
},
split_test_params {
{1, 20, 2, 5},
{{1, 10, 2, 5}, {1, 10, 2, 5}},
-1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+1, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
},
split_test_params {
{2, 20, 2, 5},
{{2, 10, 2, 5}, {2, 10, 2, 5}},
-1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+1, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
},
split_test_params {
{2, 20, 2, 5},
{{2, 15, 2, 5}, {2, 5, 2, 5}},
-1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+1, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
},
split_test_params {
{9, 11, 7, 5},
@@ -275,7 +275,7 @@ INSTANTIATE_TEST_CASE_P(
split_test_params {
{5, 6, 7, 15},
{{5, 1, 7, 15}, {5, 2, 7, 15}, {5, 1, 7, 15}, {5, 2, 7, 15}},
-1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+1, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
},
split_test_params {
{5, 6, 7, 15},
@@ -290,15 +290,15 @@ INSTANTIATE_TEST_CASE_P(
split_test_params {
{5, 6, 7, 15},
{{5, 6, 7, 15}},
-1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}},
+1, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}},
split_test_params {
{1, 32, 16, 16, 16},
{{1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}},
-1, 5, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}},
+1, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}},
split_test_params {
{1, 32, 16, 16, 16},
{{1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}},
-1, 5, MKLDNNPlugin::impl_desc_type::unknown, {}}));
+1, 6, MKLDNNPlugin::impl_desc_type::unknown, {}}));
class MKLDNNGraphDynBatchSplitTests: public MKLDNNGraphSplitTests {
protected: