[CPU] CumSum tensors with rank >= 6 support. (#3859)

Aleksandr Pertovsky 2021-02-02 10:45:52 +03:00 committed by GitHub
parent 537179b235
commit d754e9b311
2 changed files with 119 additions and 58 deletions
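In outline, the change removes the rank <= 5 cap on the 'data' input: the kernel now keeps the full input shape and the blocking-desc strides instead of a padded 5D shape, and the fixed parallel_for4d over four outer dimensions is replaced by a parallel_nt split of a flat work amount that parallelItInit/parallelItStep decode into per-dimension counters. The snippet below is a minimal standalone sketch of only that counter arithmetic; counterInit/counterStep and the 2x3x2 range are illustrative names and values, not code from the patch.

#include <cstddef>
#include <iostream>
#include <vector>

// Decode a flat work-item index into per-dimension counters (rightmost
// dimension varies fastest), analogous to parallelItInit in the patch.
void counterInit(size_t start, std::vector<size_t>& counters, const std::vector<size_t>& range) {
    for (size_t i = counters.size(); i-- > 0;) {
        counters[i] = start % range[i];
        start /= range[i];
    }
}

// Advance to the next work item, carrying into the next dimension when a
// counter wraps around, analogous to parallelItStep.
void counterStep(std::vector<size_t>& counters, const std::vector<size_t>& range) {
    for (size_t i = counters.size(); i-- > 0;) {
        counters[i] = (counters[i] + 1) % range[i];
        if (counters[i] != 0)
            break;  // no carry needed
    }
}

int main() {
    // Hypothetical 2 x 3 x 2 iteration space; one "thread" handles flat items 5..11.
    std::vector<size_t> range = {2, 3, 2};
    std::vector<size_t> counters(range.size(), 0);
    counterInit(5, counters, range);
    for (size_t iwork = 5; iwork < 12; ++iwork) {
        std::cout << counters[0] << " " << counters[1] << " " << counters[2] << "\n";
        counterStep(counters, range);
    }
    return 0;
}

Each thread receives a contiguous [start, end) slice of the flat range, so its counters are initialized once and then only stepped, which is what splitter + parallelItInit + parallelItStep do in the patch.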


@@ -16,11 +16,11 @@ namespace Cpu {
 class CumSumImpl: public ExtLayerBase {
     enum { CUM_SUM_DATA, AXIS, numOfInputs };
-    enum { N, C, D, H, W, numOfDims };
     bool exclusive;
     bool reverse;
+    size_t numOfDims;
     size_t axis = 0;
-    std::vector<size_t> shape5d;
+    std::vector<size_t> shape;
 public:
     explicit CumSumImpl(const CNNLayer* layer) {
@@ -31,9 +31,10 @@ public:
             const auto &dataTensor = layer->insData[CUM_SUM_DATA].lock()->getTensorDesc();
             const auto &dataShape = dataTensor.getDims();
-            if (dataShape.size() < 1 || dataShape.size() > 5) {
+            if (dataShape.size() < 1) {
                 THROW_IE_EXCEPTION << "CumSum layer with name '" << layerName << "' doesn't support 'data' input tensor with rank: " << dataShape.size();
             }
+            numOfDims = dataShape.size();
             exclusive = layer->GetParamAsBool("exclusive", false);
             reverse = layer->GetParamAsBool("reverse", false);
@@ -57,7 +58,7 @@ public:
             if (dataShape != layer->outData[0]->getTensorDesc().getDims())
                 THROW_IE_EXCEPTION << "CumSum layer with name '" << layerName << "' has different 'data' input and output dimensions";
-            shape5d = get5dShape(dataShape);
+            shape = dataShape;
             LayerConfig config;
             for (size_t i = 0; i < layer->insData.size(); i++) {
@@ -65,7 +66,7 @@ public:
                 inConfig.inPlace = -1;
                 inConfig.constant = false;
-                Precision inPrecision = layer->insData[i].lock()->getTensorDesc().getPrecision();
+                Precision inPrecision = i == 1 ? Precision(Precision::I32) : layer->insData[i].lock()->getTensorDesc().getPrecision();
                 if (inPrecision == Precision::BF16)
                     inPrecision = Precision::FP32;
                 const SizeVector& inDims = layer->insData[i].lock()->getTensorDesc().getDims();
@@ -120,75 +121,121 @@ private:
     void execImpl(const Blob::CPtr& _input, const Blob::Ptr& _output) {
         const auto *input = _input->cbuffer().as<const dataType *>() + _input->getTensorDesc().getBlockingDesc().getOffsetPadding();
         auto *output = _output->buffer().as<dataType *>() + _output->getTensorDesc().getBlockingDesc().getOffsetPadding();
-        const size_t offset = _input->getTensorDesc().getBlockingDesc().getStrides()[axis];
+        const std::vector<size_t> strides = _input->getTensorDesc().getBlockingDesc().getStrides();
         if (reverse) {
             if (exclusive) {
-                cumSum<true, true, dataType>(input, output, offset);
+                cumSum<true, true, dataType>(input, output, strides);
             } else {
-                cumSum<true, false, dataType>(input, output, offset);
+                cumSum<true, false, dataType>(input, output, strides);
             }
         } else {
             if (exclusive) {
-                cumSum<false, true, dataType>(input, output, offset);
+                cumSum<false, true, dataType>(input, output, strides);
             } else {
-                cumSum<false, false, dataType>(input, output, offset);
+                cumSum<false, false, dataType>(input, output, strides);
             }
         }
     }
     template <bool reverse, bool exclusive, typename dataType>
-    void cumSum(const dataType *input, dataType *output, const size_t &offset) {
-        std::vector<size_t> iterationRange(numOfDims - 1);
+    void cumSum(const dataType *input, dataType *output, const std::vector<size_t> &strides) {
+        SizeVector iterationRange(numOfDims - 1);
         size_t j = 0;
-        for (size_t i = 0; i < shape5d.size(); i++) {
+        for (size_t i = 0; i < shape.size(); i++) {
             if (i == axis)
                 continue;
-            iterationRange[j++] = shape5d[i];
+            iterationRange[j++] = shape[i];
         }
-        parallel_for4d(iterationRange[0], iterationRange[1], iterationRange[2], iterationRange[3], [&](size_t ir0, size_t ir1, size_t ir2, size_t ir3) {
-            std::vector<size_t> forStartOffset;
-            forStartOffset.push_back(ir0); forStartOffset.push_back(ir1); forStartOffset.push_back(ir2); forStartOffset.push_back(ir3);
-            forStartOffset.insert(forStartOffset.begin() + axis, 0);
-            size_t startOffset = getStartOffset(forStartOffset);
-            const dataType *inputStart = input + startOffset;
-            dataType *outputStart = output + startOffset;
-            if (reverse) {
-                if (exclusive) {
-                    outputStart[offset*(shape5d[axis] - 1)] = 0;
-                    for (int64_t i = shape5d[axis] - 2; i >= 0; i--) {
-                        outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset];
-                    }
-                } else {
-                    outputStart[offset*(shape5d[axis] - 1)] = inputStart[offset*(shape5d[axis] - 1)];
-                    for (int64_t i = shape5d[axis] - 2; i >= 0; i--) {
-                        outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset];
-                    }
-                }
-            } else {
-                if (exclusive) {
-                    outputStart[0] = 0;
-                    for (size_t i = 1; i < shape5d[axis]; i++) {
-                        outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset];
-                    }
-                } else {
-                    outputStart[0] = inputStart[0];
-                    for (size_t i = 1; i < shape5d[axis]; i++) {
-                        outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset];
-                    }
-                }
-            }
+        size_t work_amount_dst = std::accumulate(iterationRange.begin(), iterationRange.end(), 1, std::multiplies<size_t>());
+        parallel_nt(0, [&](const int ithr, const int nthr) {
+            size_t start = 0, end = 0;
+            SizeVector counters(numOfDims - 1, 0);
+            splitter(work_amount_dst, nthr, ithr, start, end);
+            parallelItInit(start, counters, iterationRange);
+            for (size_t iwork = start; iwork < end; ++iwork) {
+                std::vector<size_t> forStartOffset(numOfDims);
+                forStartOffset[axis] = 0;
+                for (int64_t offsetIdx = 0, countersIdx = 0; offsetIdx < numOfDims; ++offsetIdx) {
+                    if (offsetIdx == axis) {
+                        continue;
+                    }
+                    forStartOffset[offsetIdx] = counters[countersIdx++];
+                }
+                size_t startOffset = getStartOffset(forStartOffset, strides);
+                const dataType *inputStart = input + startOffset;
+                dataType *outputStart = output + startOffset;
+                size_t offset = strides[axis];
+                if (reverse) {
+                    if (exclusive) {
+                        outputStart[offset*(shape[axis] - 1)] = 0;
+                        for (int64_t i = shape[axis] - 2; i >= 0; i--) {
+                            outputStart[i*offset] = inputStart[(i+1)*offset] + outputStart[(i+1)*offset];
+                        }
+                    } else {
+                        outputStart[offset*(shape[axis] - 1)] = inputStart[offset * (shape[axis] - 1)];
+                        for (int64_t i = shape[axis] - 2; i >= 0; i--) {
+                            outputStart[i*offset] = inputStart[i*offset] + outputStart[(i+1)*offset];
+                        }
+                    }
+                } else {
+                    if (exclusive) {
+                        outputStart[0] = 0;
+                        for (size_t i = 1; i < shape[axis]; i++) {
+                            outputStart[i*offset] = inputStart[(i-1)*offset] + outputStart[(i-1)*offset];
+                        }
+                    } else {
+                        outputStart[0] = inputStart[0];
+                        for (size_t i = 1; i < shape[axis]; i++) {
+                            outputStart[i*offset] = inputStart[i*offset] + outputStart[(i-1)*offset];
+                        }
+                    }
+                }
+                parallelItStep(counters, iterationRange);
+            }
         });
     }
-    size_t getStartOffset(std::vector<size_t> &forStartOffset) {
-        return forStartOffset[N]*shape5d[C]*shape5d[D]*shape5d[H]*shape5d[W] + forStartOffset[C]*shape5d[D]*shape5d[H]*shape5d[W] +
-               forStartOffset[D]*shape5d[H]*shape5d[W] + forStartOffset[H]*shape5d[W] + forStartOffset[W];
-    }
+    void parallelItInit(size_t start, std::vector<size_t>& counters, const std::vector<size_t>& iterationRange) {
+        auto itCounter = counters.rbegin();
+        auto itWork = iterationRange.rbegin();
+        while (itCounter != counters.rend()) {
+            *itCounter = start % *itWork;
+            start /= *itWork;
+            ++itCounter;
+            ++itWork;
+        }
+    }
+    inline void parallelItStep(std::vector<size_t>& counters, const std::vector<size_t>& iterationRange) {
+        auto itCounter = counters.rbegin();
+        auto itWork = iterationRange.rbegin();
+        while (itCounter != counters.rend()) {
+            *itCounter = (*itCounter + 1) % *itWork;
+            if (*itCounter != 0) {
+                break;
+            }
+            ++itCounter;
+            ++itWork;
+        }
+    }
+    inline size_t getStartOffset(const std::vector<size_t> &forStartOffset, const std::vector<size_t>& strides) const {
+        size_t startOffset = 0;
+        for (size_t idx = 0; idx < forStartOffset.size(); ++idx) {
+            startOffset += forStartOffset[idx] * strides[idx];
+        }
+        return startOffset;
+    }
-    size_t getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) {
+    size_t getAxis(const Blob::CPtr& _axis, const Blob::CPtr& _data) const {
         const auto& axisPrecision = _axis->getTensorDesc().getPrecision();
         const int64_t dataShapeSize = static_cast<int64_t>(_data->getTensorDesc().getDims().size());
         int64_t axisValueFromBlob;
@@ -212,13 +259,6 @@ private:
         return axisValueFromBlob >= 0 ? axisValueFromBlob : (axisValueFromBlob + dataShapeSize);
     }
-    std::vector<size_t> get5dShape(const SizeVector& dims) {
-        std::vector<size_t> shape5d(numOfDims, 1);
-        for (size_t i = 0; i < dims.size(); i++)
-            shape5d[i] = dims[i];
-        return shape5d;
-    }
 private:
     std::string layerName;
 };
@@ -227,4 +267,4 @@ REG_FACTORY_FOR(CumSumImpl, CumSum);
 } // namespace Cpu
 } // namespace Extensions
-} // namespace InferenceEngine
\ No newline at end of file
+} // namespace InferenceEngine
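For context on the getStartOffset change above: the old helper hard-coded the N*C*D*H*W stride products of the padded 5D shape, while the new one multiplies each outer coordinate by the tensor's stride, which works for any rank. A minimal sketch under the assumption of dense row-major strides; denseStrides and the rank-6 numbers are illustrative, not from the patch. The second changed file, below, extends the single-layer tests with rank-6 and rank-10 shapes and the matching axis values.

#include <cstddef>
#include <iostream>
#include <vector>

// Dense (row-major) strides: strides[i] is how many elements to skip when
// coordinate i increases by one.
std::vector<size_t> denseStrides(const std::vector<size_t>& shape) {
    std::vector<size_t> strides(shape.size(), 1);
    for (size_t i = shape.size() - 1; i-- > 0;)
        strides[i] = strides[i + 1] * shape[i + 1];
    return strides;
}

int main() {
    // Hypothetical rank-6 tensor; the cumulated axis (axis 0 here) is held at coordinate 0.
    std::vector<size_t> shape = {2, 3, 4, 2, 3, 5};
    std::vector<size_t> coord = {0, 1, 2, 0, 1, 3};
    std::vector<size_t> strides = denseStrides(shape);

    // Same idea as the new getStartOffset: a plain coordinate-times-stride sum.
    size_t startOffset = 0;
    for (size_t i = 0; i < coord.size(); ++i)
        startOffset += coord[i] * strides[i];

    std::cout << startOffset << "\n";  // 1*120 + 2*30 + 0*15 + 1*5 + 3*1 = 188
    return 0;
}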


@@ -14,7 +14,9 @@ const std::vector<std::vector<size_t>> shapes = {
     {9, 15},
     {16, 10, 12},
     {5, 14, 5, 7},
-    {7, 8, 6, 7, 13}
+    {7, 8, 6, 7, 13},
+    {2, 3, 4, 2, 3, 5},
+    {4, 3, 6, 2, 3, 4, 5, 2, 3, 4},
 };
 const std::vector<InferenceEngine::Precision> inputPrecision = {
@@ -25,14 +27,14 @@ const std::vector<InferenceEngine::Precision> inputPrecision = {
     InferenceEngine::Precision::FP32
 };
-const std::vector<int64_t> axes = { 0, 1, 2, 3, 4 };
-const std::vector<int64_t> negativeAxes = { -1, -2, -3, -4, -5 };
+const std::vector<int64_t> axes = { 0, 1, 2, 3, 4, 5, 6};
+const std::vector<int64_t> negativeAxes = { -1, -2, -3, -4, -5, -6 };
 const std::vector<bool> exclusive = {true, false};
 const std::vector<bool> reverse = {true, false};
 const auto testCasesNegativeAxis = ::testing::Combine(
-    ::testing::Values(std::vector<size_t>{4, 16, 3, 6, 5}),
+    ::testing::Values(std::vector<size_t>{4, 16, 3, 6, 5, 2}),
     ::testing::Values(InferenceEngine::Precision::FP32),
     ::testing::ValuesIn(negativeAxes),
     ::testing::ValuesIn(exclusive),
@@ -85,10 +87,29 @@ const auto testCasesAxis_4 = ::testing::Combine(
     ::testing::Values(CommonTestUtils::DEVICE_CPU)
 );
+const auto testCasesAxis_5 = ::testing::Combine(
+    ::testing::ValuesIn(std::vector<std::vector<size_t>>(shapes.begin() + 5, shapes.end())),
+    ::testing::ValuesIn(inputPrecision),
+    ::testing::Values(axes[5]),
+    ::testing::ValuesIn(exclusive),
+    ::testing::ValuesIn(reverse),
+    ::testing::Values(CommonTestUtils::DEVICE_CPU)
+);
+const auto testCasesAxis_6 = ::testing::Combine(
+    ::testing::ValuesIn(std::vector<std::vector<size_t>>(shapes.begin() + 6, shapes.end())),
+    ::testing::ValuesIn(inputPrecision),
+    ::testing::Values(axes[6]),
+    ::testing::ValuesIn(exclusive),
+    ::testing::ValuesIn(reverse),
+    ::testing::Values(CommonTestUtils::DEVICE_CPU)
+);
 INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_negative_axis, CumSumLayerTest, testCasesNegativeAxis, CumSumLayerTest::getTestCaseName);
 INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_axis_0, CumSumLayerTest, testCasesAxis_0, CumSumLayerTest::getTestCaseName);
 INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_axis_1, CumSumLayerTest, testCasesAxis_1, CumSumLayerTest::getTestCaseName);
 INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_axis_2, CumSumLayerTest, testCasesAxis_2, CumSumLayerTest::getTestCaseName);
 INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_axis_3, CumSumLayerTest, testCasesAxis_3, CumSumLayerTest::getTestCaseName);
 INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_axis_4, CumSumLayerTest, testCasesAxis_4, CumSumLayerTest::getTestCaseName);
+INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_axis_5, CumSumLayerTest, testCasesAxis_5, CumSumLayerTest::getTestCaseName);
+INSTANTIATE_TEST_CASE_P(smoke_MKLDNN_TestsCumSum_axis_6, CumSumLayerTest, testCasesAxis_6, CumSumLayerTest::getTestCaseName);
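As a reading aid for the four inner loops in the kernel above (not part of the commit): a scalar 1D reference of the exclusive/reverse combinations, with the hypothetical helper name cumSumRef and the axis stride fixed to 1.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// 1D reference of the four CumSum modes implemented by the kernel's inner loops.
std::vector<float> cumSumRef(const std::vector<float>& in, bool exclusive, bool reverse) {
    const size_t n = in.size();
    std::vector<float> out(n);
    if (!reverse) {
        out[0] = exclusive ? 0.0f : in[0];
        for (size_t i = 1; i < n; i++)
            out[i] = (exclusive ? in[i - 1] : in[i]) + out[i - 1];
    } else {
        out[n - 1] = exclusive ? 0.0f : in[n - 1];
        for (int64_t i = static_cast<int64_t>(n) - 2; i >= 0; i--)
            out[i] = (exclusive ? in[i + 1] : in[i]) + out[i + 1];
    }
    return out;
}

int main() {
    const std::vector<float> in = {1, 2, 3, 4};
    for (bool reverse : {false, true})
        for (bool exclusive : {false, true}) {
            std::cout << "reverse=" << reverse << " exclusive=" << exclusive << ":";
            for (float v : cumSumRef(in, exclusive, reverse))
                std::cout << " " << v;
            std::cout << "\n";  // e.g. reverse=0 exclusive=0: 1 3 6 10
        }
    return 0;
}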