Gna cnn2d with pooling support (#4574)

* [GNA] Support Pooling 2D

  Refactor Pooling
  Remove unused sum pooling
  Skip ReorderPoolActivation pass when 2D pooling detected
  Enable fusing of Pool2D with other components into GNA Operation

* Support GNA_SW_FP32 mode for Pooling 2D with tests

* Apply review

typo

* Apply review

Restore indentation
This commit is contained in:
Krzysztof Bruniecki 2021-03-15 14:16:45 +01:00 committed by GitHub
parent 95a13e05d5
commit ecee373220
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 366 additions and 146 deletions

View File

@ -255,34 +255,26 @@ void GNAPluginNS::backend::AMIntelDNN::InitConvolutional2DComponentPrivate(intel
#endif
void GNAPluginNS::backend::AMIntelDNN::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp,
uint32_t num_rows_in,
uint32_t num_columns_in,
uint32_t num_rows_out,
uint32_t num_columns_out,
uint32_t num_bytes_per_input,
uint32_t num_bytes_per_output,
uint32_t num_pool_size,
uint32_t num_pool_step,
uint32_t num_pool_stride,
bool do_sum_not_max,
float output_scale_factor,
void *&ptr_inputs,
void *&ptr_outputs,
bool postInitMem) {
comp.num_rows_in = num_rows_in;
comp.num_columns_in = num_columns_in;
comp.num_rows_out = num_rows_out;
comp.num_columns_out = num_columns_out;
std::array<uint32_t, 3> inCHW,
std::array<uint32_t, 3> outCHW,
uint32_t num_bytes_per_input,
uint32_t num_bytes_per_output,
std::array<uint32_t, 2> poolingWindowXY,
std::array<uint32_t, 2> poolingStrideXY,
float output_scale_factor,
void *&ptr_inputs,
void *&ptr_outputs,
bool postInitMem) {
comp.num_bytes_per_input = num_bytes_per_input;
comp.num_bytes_per_output = num_bytes_per_output;
comp.operation = kDnnMaxPoolOp;
comp.macro_operation = kDnnMacroOpNone;
comp.orientation_in = kDnnNonInterleavedOrientation;
comp.orientation_out = kDnnNonInterleavedOrientation;
comp.op.maxpool.num_inputs = num_pool_size;
comp.op.maxpool.num_inputs_step = num_pool_step;
comp.op.maxpool.num_inputs_stride = num_pool_stride;
comp.op.maxpool.do_sum_not_max = do_sum_not_max;
comp.op.maxpool.inCHW = inCHW;
comp.op.maxpool.outCHW = outCHW;
comp.op.maxpool.poolingWindowXY = poolingWindowXY;
comp.op.maxpool.poolingStrideXY = poolingStrideXY;
comp.output_scale_factor = output_scale_factor;
comp.input_scale_factor = output_scale_factor;
if (!postInitMem) {
@ -1209,11 +1201,17 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_
}
break;
case kDnnMaxPoolOp: {
uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1;
out_file << "<pool_type> " << std::dec << num_pool_type << "\n";
out_file << "<pool_size> " << std::dec << component[i].op.maxpool.num_inputs << "\n";
out_file << "<pool_step> " << std::dec << component[i].op.maxpool.num_inputs_step << "\n";
out_file << "<pool_num_rows> " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n";
out_file << "<pool_type> MAX\n";
out_file << "<pool_window_x> " << std::dec << component[i].op.maxpool.poolingWindowXY[0] << "\n";
out_file << "<pool_window_y> " << std::dec << component[i].op.maxpool.poolingWindowXY[1] << "\n";
out_file << "<pool_stride_x> " << std::dec << component[i].op.maxpool.poolingStrideXY[0] << "\n";
out_file << "<pool_stride_y> " << std::dec << component[i].op.maxpool.poolingStrideXY[1] << "\n";
out_file << "<c_dim_in> " << std::dec << component[i].op.maxpool.inCHW[0] << "\n";
out_file << "<h_dim_in> " << std::dec << component[i].op.maxpool.inCHW[1] << "\n";
out_file << "<w_dim_in> " << std::dec << component[i].op.maxpool.inCHW[2] << "\n";
out_file << "<c_dim_out> " << std::dec << component[i].op.maxpool.outCHW[0] << "\n";
out_file << "<h_dim_out> " << std::dec << component[i].op.maxpool.outCHW[1] << "\n";
out_file << "<w_dim_out> " << std::dec << component[i].op.maxpool.outCHW[2] << "\n";
out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
<< component[i].output_scale_factor << "\n";
}
@ -1344,6 +1342,26 @@ uint32_t GNAPluginNS::backend::AMIntelDNN::CountLayers() {
return n;
}
namespace {
uint32_t outputFromConv(const uint32_t in, const uint32_t flt, const uint32_t stride) {
    // Output length of a 1D convolution: floor[(in - flt) / stride] + 1 (GNA Spec 1.24).
    // Reject degenerate geometries up front so callers get a clear diagnostic.
    const bool invalid = (flt > in) || (flt == 0) || (stride == 0);
    if (invalid) {
        THROW_GNA_EXCEPTION << "Invalid (input, filter, stride) = (" << in << "," << flt << "," << stride << ")";
    }
    const uint32_t span = in - flt;
    return span / stride + 1;
}
uint32_t outputFromPooling(const uint32_t in, const uint32_t window, const uint32_t stride) {
    // Pooled output length: ceil[(in - window) / stride] + 1 (GNA Spec 1.24).
    if (window > in || window == 0 || stride == 0) {
        THROW_GNA_EXCEPTION << "Invalid (input, window, stride) = (" << in << "," << window << "," << stride << ")";
    }
    if (window == in) {
        return 1;
    }
    // Integer ceiling of span/stride, plus one for the initial window position.
    const uint32_t span = in - window;
    return (span + stride - 1) / stride + 1;
}
} // namespace
#if GNA_LIB_VER == 2
void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(Gna2Model *gnaModel) {
Gna2Operation * gnaOperation;
@ -1622,18 +1640,29 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet
#if GNA_LIB_VER == 2
} else if (gnaOperation->Type == Gna2OperationTypeConvolution) {
auto pwlOperand = gnaOperation->Operands[PwlOpIdx];
if (pwlOperand != nullptr && pwlOperand->Shape.Dimensions[0] != 0) {
THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i;
if (pwlOperand != nullptr && pwlOperand->Shape.Dimensions[0] != 0 &&
gnaOperation->Operands[InOpIdx]->Shape.NumberOfDimensions == 2) { // kDnnConvolutional1dOp
THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at index == " << i;
} else {
const auto poolMode = reinterpret_cast<Gna2PoolingMode*>(gnaUserAllocator(sizeof(Gna2PoolingMode)));
IE_ASSERT(poolMode != nullptr);
*poolMode = (comp.op.maxpool.do_sum_not_max) ? Gna2PoolingModeSum : Gna2PoolingModeMax;
const auto poolWindow = create_shape1D_parameter(comp.op.maxpool.num_inputs);
const auto poolStride = create_shape1D_parameter(comp.op.maxpool.num_inputs_step);
*poolMode = Gna2PoolingModeMax;
Gna2Shape* poolWindow{};
Gna2Shape* poolStride{};
if (gnaOperation->Operands[InOpIdx]->Shape.NumberOfDimensions == 2) { // kDnnConvolutional1dOp
// TODO: issue 50379 find out why it looks like CNN1D pooling uses stride == window only
poolWindow = create_shape1D_parameter(comp.op.maxpool.poolingWindowXY[0]);
poolStride = create_shape1D_parameter(comp.op.maxpool.poolingWindowXY[0]);
} else {
poolWindow = create_shape2D_parameter(comp.op.maxpool.poolingWindowXY[1], comp.op.maxpool.poolingWindowXY[0]);
poolStride = create_shape2D_parameter(comp.op.maxpool.poolingStrideXY[1], comp.op.maxpool.poolingStrideXY[0]);
}
// number of output columns correction - based on GNA-library expectations
if ((gnaOperation->NumberOfParameters > PoolModeParamIdx && gnaOperation->Parameters[PoolModeParamIdx] !=nullptr) ||
if ((gnaOperation->NumberOfParameters > PoolModeParamIdx && gnaOperation->Parameters[PoolModeParamIdx] != nullptr) ||
(gnaOperation->NumberOfParameters > PoolWinParamIdx && gnaOperation->Parameters[PoolWinParamIdx] != nullptr) ||
(gnaOperation->NumberOfParameters > PoolStrideParamIdx && gnaOperation->Parameters[PoolStrideParamIdx] != nullptr)) {
THROW_GNA_EXCEPTION << "Pooling parameters should not be initialized";
@ -1642,15 +1671,41 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet
HelperGna2OperationSetParameter(gnaOperation, gnaUserAllocator, gnaUserFree, PoolWinParamIdx, poolWindow);
HelperGna2OperationSetParameter(gnaOperation, gnaUserAllocator, gnaUserFree, PoolStrideParamIdx, poolStride);
const auto inVecCnt = gnaOperation->Operands[InOpIdx]->Shape.Dimensions[1];
auto& outputTensor = const_cast<Gna2Tensor&>(*gnaOperation->Operands[OutOpIdx]);
const auto fltStrideShape = reinterpret_cast<Gna2Shape*>(gnaOperation->Parameters[ConvStrideParamIdx]);
// adjust Gna2OperationTypeConvolution fused layer output dimensions to reflect convolution zeroPadding and pooling
if (gnaOperation->Operands[InOpIdx]->Shape.NumberOfDimensions == 2) { // kDnnConvolutional1dOp
const auto inVecCnt = gnaOperation->Operands[InOpIdx]->Shape.Dimensions[1];
const auto nFltSize = gnaOperation->Operands[FilterOpIdx]->Shape.Dimensions[1];
// Always move 1 "row"
const auto fltStrideSz = reinterpret_cast<Gna2Shape*>(gnaOperation->Parameters[ConvStrideParamIdx])->Dimensions[0];
const auto maxNCOE = (inVecCnt - nFltSize) / fltStrideSz + 1;
// FLAT input matrix, pooled outputs per filter
const_cast<Gna2Tensor*>(gnaOperation->Operands[OutOpIdx])->Shape.Dimensions[1] =
(maxNCOE - 1) / poolStride->Dimensions[0] + 1;
const auto nFltSize = gnaOperation->Operands[FilterOpIdx]->Shape.Dimensions[1];
// Always move 1 "row"
const auto fltStride = fltStrideShape->Dimensions[0];
const auto outFromConv = outputFromConv(inVecCnt, nFltSize, fltStride);
// FLAT input matrix, pooled outputs per filter
// TODO: Issue 50386 check why (outFromConv - 1) and not (outFromConv - poolingWindow)
outputTensor.Shape.Dimensions[1] =
(outFromConv - 1) / poolStride->Dimensions[0] + 1;
} else { // kDnnConvolutional2dOp
// Override GNA operation output pointer with the one from pooling component
outputTensor.Data = comp.ptr_outputs;
Gna2Shape zeroPadding{};
if (gnaOperation->NumberOfParameters > ZeroPaddingParamIdx && gnaOperation->Parameters[ZeroPaddingParamIdx] != nullptr) {
zeroPadding = *reinterpret_cast<Gna2Shape*>(gnaOperation->Parameters[ZeroPaddingParamIdx]);
}
const int beginOfHInNHWC = 1;
const int beginOfHInHW = 0;
for (auto&& dimHW : { 0, 1 }) {
const auto inputPadded = gnaOperation->Operands[InOpIdx]->Shape.Dimensions[beginOfHInNHWC + dimHW]
+ zeroPadding.Dimensions[beginOfHInHW + dimHW] * 2;
const auto nFltSize = gnaOperation->Operands[FilterOpIdx]->Shape.Dimensions[beginOfHInNHWC + dimHW];
const auto fltStride = fltStrideShape->Dimensions[beginOfHInHW + dimHW];
const auto outFromConv = outputFromConv(inputPadded, nFltSize, fltStride);
outputTensor.Shape.Dimensions[beginOfHInNHWC + dimHW] =
outputFromPooling(outFromConv, poolWindow->Dimensions[beginOfHInHW + dimHW], poolStride->Dimensions[beginOfHInHW + dimHW]);
}
AdvanceOperationIfAllApplied(component, i, gnaOperation);
}
}
#else
} else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) {
@ -1662,21 +1717,18 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet
if (pConvolutionalLayer->pwl.nSegments != 0) {
THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i;
} else {
pConvolutionalLayer->poolType =
(component[i].op.maxpool.do_sum_not_max) ? INTEL_SUM_POOLING : INTEL_MAX_POOLING;
pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs;
pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step;
pConvolutionalLayer->poolType = INTEL_MAX_POOLING;
// TODO: issue 50379 find out why it looks like CNN1D pooling uses stride == window only
pConvolutionalLayer->nPoolSize = component[i].op.maxpool.poolingWindowXY[0];
pConvolutionalLayer->nPoolStride = component[i].op.maxpool.poolingWindowXY[0];
// number of output columns correction - based on GNA-library expectations
auto nFltSize = pConvolutionalLayer->nFilterCoefficients;
auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns; // always move 1 "row"
auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1;
auto outFromConv = outputFromConv(pLayer->nInputColumns, nFltSize, fltStrideSz);
// FLAT input matrix, pooled outputs per filter
pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1);
// old code
// pLayer->nOutputColumns /= pConvolutionalLayer->nPoolStride;
// TODO: Issue 50386 check why (outFromConv - 1) and not (outFromConv - nPoolSize)
pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((outFromConv - 1) / pConvolutionalLayer->nPoolStride + 1);
}
#endif
} else {
@ -1729,7 +1781,7 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet
}
}
}
gnaOperation++;
AdvancePwlOperationIfAllApplied(component, i, gnaOperation);
#else
pLayer->pOutputs = component[i].ptr_outputs;
pLayer->nBytesPerOutput = component[i].num_bytes_per_output;

View File

@ -166,30 +166,22 @@ public:
template<class A, class B>
static void InitMaxpoolComponent(intel_dnn_component_t &cmp,
uint32_t num_rows_in,
uint32_t num_columns_in,
uint32_t num_rows_out,
uint32_t num_columns_out,
std::array<uint32_t, 3> inCHW,
std::array<uint32_t, 3> outCHW,
uint32_t num_bytes_per_input,
uint32_t num_bytes_per_output,
uint32_t num_pool_size,
uint32_t num_pool_step,
uint32_t num_pool_stride,
bool do_sum_not_max,
std::array<uint32_t, 2> poolingWindowXY,
std::array<uint32_t, 2> poolingStrideXY,
float output_scale_factor,
A *&ptr_inputs,
B *&ptr_outputs) {
InitMaxpoolComponentPrivate(cmp,
num_rows_in,
num_columns_in,
num_rows_out,
num_columns_out,
inCHW,
outCHW,
num_bytes_per_input,
num_bytes_per_output,
num_pool_size,
num_pool_step,
num_pool_stride,
do_sum_not_max,
poolingWindowXY,
poolingStrideXY,
output_scale_factor,
(void *&) ptr_inputs,
(void *&) ptr_outputs,
@ -389,16 +381,12 @@ private:
bool postInitMem);
static void InitMaxpoolComponentPrivate(intel_dnn_component_t &cmp,
uint32_t num_rows_in,
uint32_t num_columns_in,
uint32_t num_rows_out,
uint32_t num_columns_out,
std::array<uint32_t, 3> inCHW,
std::array<uint32_t, 3> outCHW,
uint32_t num_bytes_per_input,
uint32_t num_bytes_per_output,
uint32_t num_pool_size,
uint32_t num_pool_step,
uint32_t num_pool_stride,
bool do_sum_not_max,
std::array<uint32_t, 2> poolingWindowXY,
std::array<uint32_t, 2> poolingStrideXY,
float output_scale_factor,
void *&ptr_inputs,
void *&ptr_outputs,

View File

@ -57,5 +57,12 @@ void AdvanceCnnOperationIfAllApplied(const std::vector<intel_dnn_component_t>& c
}
}
template <class T>
void AdvancePwlOperationIfAllApplied(const std::vector<intel_dnn_component_t>& component, int i, T*& operation) {
    // A PWL (activation) component may still be followed by a max-pool component
    // that fuses into the same GNA operation; advance only when no pooling follows.
    const bool isLastComponent = (i == component.size() - 1);
    if (isLastComponent || component[i + 1].operation != kDnnMaxPoolOp) {
        ++operation;
    }
}
} // namespace backend
} // namespace GNAPluginNS

View File

@ -164,10 +164,10 @@ typedef struct {
} intel_convolutional2D_t;
typedef struct {
uint32_t num_inputs; // pool size
uint32_t num_inputs_step; // pool step
uint32_t num_inputs_stride; // pool stride (number of convolution filters)
bool do_sum_not_max;
std::array<uint32_t, 2> poolingWindowXY;
std::array<uint32_t, 2> poolingStrideXY;
std::array<uint32_t, 3> inCHW;
std::array<uint32_t, 3> outCHW;
} intel_maxpool_t;
typedef struct {

View File

@ -523,7 +523,7 @@ class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBas
outData->setPrecision(Desc::mandatory().getInputPrecision());
}
} else {
if (LayerInfo(*cnnLayer).isActivation() ||
if (LayerInfo(*cnnLayer).isActivation() ||
LayerInfo(*cnnLayer).isCopy() ||
LayerInfo(*cnnLayer).isNonFunctional() ||
LayerInfo(*cnnLayer).isPermute() ||
@ -533,6 +533,13 @@ class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBas
outData->setPrecision(Desc::mandatory().getInputPrecision());
}
}
// for pooling layer output precision is the same as input precision
if (LayerInfo(*cnnLayer).isMaxPooling()) {
const auto inputPrecision = cnnLayer->insData.front().lock()->getPrecision();
for (auto&& outData : cnnLayer->outData) {
outData->setPrecision(inputPrecision);
}
}
}
cnnLayer->precision = Desc::mandatory().getInputPrecision();

View File

@ -833,15 +833,15 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
auto inputs = layer->insData.begin()->lock();
auto outputs = *layer->outData.begin();
auto in_order = getFromIRDimsOrderNCHW(inputs->getLayout());
const auto in_order = getFromIRDimsOrderNCHW(inputs->getLayout());
uint32_t w_dim_in = FROM_IR_DIM(inputs, in_order[3]);
uint32_t h_dim_in = FROM_IR_DIM(inputs, in_order[2]);
uint32_t c_dim_in = FROM_IR_DIM(inputs, in_order[1]);
const uint32_t c_dim_in = FROM_IR_DIM(inputs, in_order[1]);
auto out_order = getFromIRDimsOrderNCHW(outputs->getLayout());
const auto out_order = getFromIRDimsOrderNCHW(outputs->getLayout());
uint32_t w_dim_out = FROM_IR_DIM(outputs, out_order[3]);
uint32_t h_dim_out = FROM_IR_DIM(outputs, out_order[2]);
uint32_t c_dim_out = FROM_IR_DIM(outputs, out_order[1]);
const uint32_t c_dim_out = FROM_IR_DIM(outputs, out_order[1]);
if (w_dim_in == 1) { // swap dimensions if needed to support swapped 1D case
swap(h_dim_in, w_dim_in);
@ -849,12 +849,6 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
swap(pooling._kernel[X_AXIS], pooling._kernel[Y_AXIS]);
}
uint32_t num_rows_in = w_dim_in;
uint32_t num_columns_in = c_dim_in;
uint32_t num_rows_out = w_dim_out;
uint32_t num_columns_out = c_dim_out;
uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
void* ptr_inputs = nullptr;
void* ptr_outputs = nullptr;
@ -870,16 +864,12 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
}
dnn->InitMaxpoolComponent(currentComponent,
1,
num_columns_in * num_rows_in,
1,
num_columns_out * num_rows_out,
{ c_dim_in, h_dim_in, w_dim_in },
{ c_dim_out, h_dim_out, w_dim_out },
inputs->getPrecision().size(),
outputs->getPrecision().size(),
pooling._kernel[X_AXIS],
pooling._kernel[X_AXIS],
num_columns_in,
false,
{ pooling._kernel[X_AXIS], pooling._kernel[Y_AXIS] },
{ pooling._stride[X_AXIS], pooling._stride[Y_AXIS] },
quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs,
ptr_outputs);
@ -887,7 +877,11 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims()))
* outputs->getPrecision().size();
size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->getPrecision().size();
const auto hw_in = h_dim_in * w_dim_in;
// TODO: Is this really needed? Find out why.
uint32_t num_padding = ALIGN(hw_in, 8) - hw_in;
size_t num_data_bytes_in = c_dim_in * (hw_in + num_padding) * inputs->getPrecision().size();
connectInput(layer, ptr_inputs, num_data_bytes_in);
connectOutput(layer, ptr_outputs, num_data_bytes_out);

View File

@ -247,12 +247,22 @@ void GNAPlugin::ExportScores(void *ptr_dst,
ptr_src_vec, num_active_elements * sizeof(int16_t));
}
} else if (num_bytes_per_element == 4) { // should work for both int and float
for (uint32_t i = 0; i < num_frames; i++) {
void *ptr_dst_vec = reinterpret_cast<uint8_t *>(ptr_dst) + i * num_vector_elements * sizeof(float);
const void *ptr_src_vec = reinterpret_cast<const uint8_t *>(ptr_src) + i * num_vector_stride * sizeof(float);
memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float));
ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(float),
ptr_src_vec, num_active_elements * sizeof(float));
if (num_bytes_per_element_input == 2) {
for (uint32_t i = 0; i < num_frames; i++) {
auto ptr_dst_vec = reinterpret_cast<int32_t*>(ptr_dst) + i * num_vector_elements;
auto ptr_src_vec = reinterpret_cast<const int16_t*>(ptr_src) + i * num_vector_stride;
for (uint32_t j = 0; j < num_vector_elements; j++) {
ptr_dst_vec[j] = ptr_src_vec[j];
}
}
} else {
for (uint32_t i = 0; i < num_frames; i++) {
void* ptr_dst_vec = reinterpret_cast<uint8_t*>(ptr_dst) + i * num_vector_elements * sizeof(float);
const void* ptr_src_vec = reinterpret_cast<const uint8_t*>(ptr_src) + i * num_vector_stride * sizeof(float);
memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float));
ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(float),
ptr_src_vec, num_active_elements * sizeof(float));
}
}
} else {
THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes";

View File

@ -208,6 +208,16 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
auto prevLayer = PrevFunctionalLayer(l, 0);
// No need to insert identity activation
// when activation was already there before pooling
// in case of CNN -> Activation -> Pooling order
if (LayerInfo(prevLayer).isPooling()) {
auto prevPrevLayer = PrevFunctionalLayer(prevLayer, 0);
if (LayerInfo(prevPrevLayer).isActivation()) {
return prevLayers;
}
}
if (!LayerInfo(prevLayer).has32BOutput())
return prevLayers;
@ -312,6 +322,13 @@ void ForbidActivationFusingPass::run() {
}
}
namespace {
// True when the first two extents describe a genuinely two-dimensional
// kernel/stride, i.e. both are present and strictly greater than 1.
template<class T>
bool is2D(T&& vec) {
    if (vec.size() < 2) {
        return false;
    }
    return vec[0] > 1 && vec[1] > 1;
}
}  // namespace
void ReorderMaxPoolPass::run() {
// detecting following pattern
// conv->relu->maxpooling
@ -320,6 +337,10 @@ void ReorderMaxPoolPass::run() {
auto pool = LayerInfo(l);
if (!pool.isMaxPooling()) continue;
// don't reorder if pooling is 2D for CNN2D
auto pooling = dynamic_cast<PoolingLayer*>(l.get());
if (pooling == nullptr || (is2D(pooling->_kernel) || is2D(pooling->_stride))) continue;
// checking prev layer type
auto activation = LayerInfo(CNNNetPrevLayer(l));
if (!activation.isActivation()) continue;

View File

@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include <algorithm>
#include <limits>
#include <cstdint>
#include <cstdio>
@ -45,36 +46,38 @@ void CNNFilter32(intel_dnn_component_t *component) {
}
}
void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
void CNNMaxPoolLegacy(intel_dnn_component_t *component, intel_dnn_number_type_t number_type, const bool sumPoolingOverRide) {
const uint32_t num_inputs = component->op.maxpool.inCHW[0] * component->op.maxpool.inCHW[1] * component->op.maxpool.inCHW[2];
const uint32_t in_c = component->op.maxpool.inCHW[0];
// TODO: issue 50379 find out why it looks like CNN1D pooling uses stride == window only
const uint32_t num_pool_size = component->op.maxpool.poolingWindowXY[0];
const uint32_t num_pool_step = component->op.maxpool.poolingWindowXY[0];
const uint32_t num_rows_in = num_inputs / in_c;
if (number_type == kDnnInt) {
int32_t *ptr_inputs = reinterpret_cast<int32_t *>(component->ptr_inputs);
int32_t *ptr_outputs = reinterpret_cast<int32_t *>(component->ptr_outputs);
uint32_t num_inputs = component->num_columns_in;
uint32_t num_columns = component->op.maxpool.num_inputs_stride;
uint32_t num_pool_size = component->op.maxpool.num_inputs;
uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
for (uint32_t i = 0; i < num_columns; i++) {
for (uint32_t i = 0; i < in_c; i++) {
int32_t m = 0;
if (component->op.maxpool.do_sum_not_max) {
if (sumPoolingOverRide) {
uint32_t num_saturate = 0;
for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
int64_t sum = 0;
uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
for (uint32_t k = j; k < num_end; k++) {
sum += ptr_inputs[k * num_columns + i];
sum += ptr_inputs[k * in_c + i];
}
constexpr int32_t sum_max_threshold = std::numeric_limits<int32_t>::max();
constexpr int32_t sum_min_threshold = std::numeric_limits<int32_t>::min();
if (sum > sum_max_threshold) {
ptr_outputs[m * num_columns + i] = sum_max_threshold;
ptr_outputs[m * in_c + i] = sum_max_threshold;
num_saturate++;
} else if (sum < sum_min_threshold) {
ptr_outputs[m * num_columns + i] = sum_min_threshold;
ptr_outputs[m * in_c + i] = sum_min_threshold;
num_saturate++;
} else {
ptr_outputs[m * num_columns + i] = static_cast<int32_t>(sum);
ptr_outputs[m * in_c + i] = static_cast<int32_t>(sum);
}
m++;
}
@ -86,9 +89,9 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number
int32_t max = INT32_MIN;
uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
for (uint32_t k = j; k < num_end; k++) {
if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
if (ptr_inputs[k * in_c + i] > max) max = ptr_inputs[k * in_c + i];
}
ptr_outputs[m * num_columns + i] = max;
ptr_outputs[m * in_c + i] = max;
m++;
}
}
@ -96,22 +99,17 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number
} else {
float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
uint32_t num_inputs = component->num_columns_in;
uint32_t num_columns = component->op.maxpool.num_inputs_stride;
uint32_t num_pool_size = component->op.maxpool.num_inputs;
uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
for (uint32_t i = 0; i < num_columns; i++) {
for (uint32_t i = 0; i < in_c; i++) {
int32_t m = 0;
if (component->op.maxpool.do_sum_not_max) {
if (sumPoolingOverRide) {
for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
float sum = 0.0;
uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
for (uint32_t k = j; k < num_end; k++) {
sum += ptr_inputs[k * num_columns + i];
sum += ptr_inputs[k * in_c + i];
}
ptr_outputs[m * num_columns + i] = sum;
ptr_outputs[m * in_c + i] = sum;
m++;
}
} else {
@ -119,9 +117,9 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number
float max = -1e20f;
uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
for (uint32_t k = j; k < num_end; k++) {
if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
if (ptr_inputs[k * in_c + i] > max) max = ptr_inputs[k * in_c + i];
}
ptr_outputs[m * num_columns + i] = max;
ptr_outputs[m * in_c + i] = max;
m++;
}
}
@ -129,13 +127,63 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number
}
}
#if GNA_LIB_VER == 2
namespace {
// Flattens a 3D coordinate (a1, a2, a3) into a linear index; A2 and A3 are the
// sizes of the two inner dimensions.
// NOTE: a3 is the fastest changing index (stride 1) and a1 the slowest
// (stride A2*A3) — the original comment claimed the opposite.
template <typename T>
T getQubeIndex(T a1, T a2, T a3, T A2, T A3) {
    return (a1 * A2 + a2) * A3 + a3;
}
}  // namespace
// Computes one max-pooled output value for an HWC-laid-out float tensor.
// (oh, ow, oc) addresses the output element; the pooling window is clipped
// at the bottom/right input borders.
float MaxPool2D32SingleHWC(const unsigned poolWinH, const unsigned poolWinW,
    const float* input, const unsigned IH, const unsigned IW, const unsigned IC,
    const unsigned oh, const unsigned ow, const unsigned oc,
    const uint32_t poolStrideH,
    const uint32_t poolStrideW) {
    const auto firstH = oh * poolStrideH;
    const auto firstW = ow * poolStrideW;
    auto best = std::numeric_limits<float>::lowest();
    for (unsigned h = firstH; h < firstH + poolWinH && h < IH; h++) {
        for (unsigned w = firstW; w < firstW + poolWinW && w < IW; w++) {
            const auto inputIndex = getQubeIndex(h, w, oc, IW, IC);
            best = (std::max)(best, input[inputIndex]);
        }
    }
    return best;
}
// Reference (FP32) implementation of 2D MAX pooling for a single component.
// Dimensions are carried as CHW triples and pooling geometry as XY pairs in
// the component descriptor; the data itself is laid out HWC.
void CNNMaxPool2DFloat(intel_dnn_component_t* component) {
    const auto& mp = component->op.maxpool;
    auto in = reinterpret_cast<float*>(component->ptr_inputs);
    auto out = reinterpret_cast<float*>(component->ptr_outputs);
    const auto OC = mp.outCHW[0];
    const auto OH = mp.outCHW[1];
    const auto OW = mp.outCHW[2];
    const auto IC = mp.inCHW[0];
    const auto IH = mp.inCHW[1];
    const auto IW = mp.inCHW[2];
    const auto winW = mp.poolingWindowXY[0];
    const auto winH = mp.poolingWindowXY[1];
    const auto strideW = mp.poolingStrideXY[0];
    const auto strideH = mp.poolingStrideXY[1];
    for (unsigned oc = 0; oc < OC; oc++) {
        for (unsigned ow = 0; ow < OW; ow++) {
            for (unsigned oh = 0; oh < OH; oh++) {
                out[getQubeIndex(oh, ow, oc, OW, OC)] =
                    MaxPool2D32SingleHWC(winH, winW, in, IH, IW, IC, oh, ow, oc, strideH, strideW);
            }
        }
    }
}
#if GNA_LIB_VER == 2
bool matchesPaddedArea(unsigned filterIndex, unsigned outputIndex, unsigned inputSize, unsigned paddingSize, unsigned stride) {
const auto paddedIndex = stride * outputIndex + filterIndex;
@ -228,3 +276,23 @@ void CNN2DFilter32(intel_dnn_component_t* component) {
}
#endif
namespace {
// A kernel/stride vector counts as 2D when it carries at least two extents
// and neither of the first two collapses to 1.
template<class T>
bool is2D(T&& vec) {
    return vec.size() > 1 && !(vec[0] <= 1 || vec[1] <= 1);
}
}  // namespace
// Dispatches pooling execution: 2D geometry goes to the float reference
// kernel (MAX only); anything else takes the legacy 1D path.
void CNNMaxPool(intel_dnn_component_t* component, intel_dnn_number_type_t number_type, const bool sumPoolingOverRide) {
    const auto& mp = component->op.maxpool;
    const bool pooling2D = is2D(mp.poolingStrideXY) || is2D(mp.poolingWindowXY);
    if (!pooling2D) {
        CNNMaxPoolLegacy(component, number_type, sumPoolingOverRide);
        return;
    }
    if (sumPoolingOverRide) {
        THROW_GNA_EXCEPTION << "SUM pooling2D not supported";
    }
    CNNMaxPool2DFloat(component);
}

View File

@ -12,7 +12,7 @@
#define CNN_MAX_POOL_SIZE 6
void CNNFilter32(intel_dnn_component_t *component);
void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type);
void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type, const bool sumPoolingOverRide = false);
#if GNA_LIB_VER == 2
void CNN2DFilter32(intel_dnn_component_t* component);

View File

@ -4,6 +4,8 @@
#include <vector>
#include "gna/gna_config.hpp"
#include "subgraph_tests/convolution_relu_sequence.hpp"
#include "common_test_utils/test_constants.hpp"
@ -20,20 +22,49 @@ const std::vector<size_t> inputShapeSimple = {
{1, 32, 64, 16},
};
const std::vector<size_t> inputShapeSimpleWithPooling = {
{1, 32, 128, 32},
};
const std::vector<convReluSpecificParams> convReluSpecificParamsSimpleSeq {
{
{2, 2}, // Kernel size
{2, 2}, // Stride
{0, 0}, // Pad begin
{0, 0}, // Pad end
3 // Num out channels
3, // Num out channels
{1, 1}, //Pooling window
{1, 1} //Pooling stride
},
{
{2, 5}, // Kernel size
{2, 3}, // Stride
{0, 0}, // Pad begin
{0, 0}, // Pad end
8 // Num out channels
8, // Num out channels
{1, 1}, //Pooling window
{1, 1} //Pooling stride
},
};
const std::vector<convReluSpecificParams> convReluSpecificParamsSimpleSeqWithPooling {
{
{3, 3}, // Kernel size
{1, 1}, // Stride
{0, 0}, // Pad begin
{0, 0}, // Pad end
3, // Num out channels
{2, 3}, //Pooling window
{2, 3} //Pooling stride
},
{
{2, 2}, // Kernel size
{1, 2}, // Stride
{0, 0}, // Pad begin
{0, 0}, // Pad end
8, // Num out channels
{2, 3}, //Pooling window
{2, 2} //Pooling stride
},
};
@ -47,21 +78,27 @@ const std::vector<convReluSpecificParams> convReluSpecificParamsFBSeq = {
{1, 1}, // Stride
{2, 3}, // Pad begin
{2, 3}, // Pad end
32 // Num out channels
32, // Num out channels
{1, 1}, //Pooling window
{1, 1} //Pooling stride
},
{
{9, 5}, // Kernel size
{1, 1}, // Stride
{4, 2}, // Pad begin
{4, 2}, // Pad end
32 // Num out channels
32, // Num out channels
{1, 1}, //Pooling window
{1, 1} //Pooling stride
},
{
{1, 1}, // Kernel size
{1, 1}, // Stride
{0, 0}, // Pad begin
{0, 0}, // Pad end
8 // Num out channels
8, // Num out channels
{1, 1}, //Pooling window
{1, 1} //Pooling stride
},
};
@ -73,6 +110,22 @@ const std::vector<convReluSpecificParamsAll> convReluSpecificParamsAllAll = {
{
inputShapeFB,
convReluSpecificParamsFBSeq
},
{
inputShapeSimpleWithPooling,
convReluSpecificParamsSimpleSeqWithPooling
}
};
const std::vector<std::map<std::string, std::string> > configs = {
{
{InferenceEngine::GNAConfigParams::KEY_GNA_DEVICE_MODE, InferenceEngine::GNAConfigParams::GNA_AUTO}
},
{
{InferenceEngine::GNAConfigParams::KEY_GNA_DEVICE_MODE, InferenceEngine::GNAConfigParams::GNA_SW_FP32}
},
{
{InferenceEngine::GNAConfigParams::KEY_GNA_DEVICE_MODE, InferenceEngine::GNAConfigParams::GNA_SW_EXACT}
}
};
@ -83,7 +136,8 @@ INSTANTIATE_TEST_CASE_P(DISABLED_smoke_ConvolutionReluSequenceTest, ConvolutionR
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(CommonTestUtils::DEVICE_GNA)),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs)),
ConvolutionReluSequenceTest::getTestCaseName);
} // namespace

View File

@ -22,6 +22,8 @@ typedef struct {
std::vector<ptrdiff_t> padBegin;
std::vector<ptrdiff_t> padEnd;
size_t numOutChannels;
InferenceEngine::SizeVector poolingWindow;
InferenceEngine::SizeVector poolingStride;
} convReluSpecificParams;
typedef struct {
@ -30,11 +32,12 @@ typedef struct {
} convReluSpecificParamsAll;
typedef std::tuple<
convReluSpecificParamsAll, // CNN2D sequence desc
InferenceEngine::Precision, // Net precision
InferenceEngine::Precision, // Input precision
InferenceEngine::Precision, // Output precision
LayerTestsUtils::TargetDevice // Device name
convReluSpecificParamsAll, // CNN2D sequence desc
InferenceEngine::Precision, // Net precision
InferenceEngine::Precision, // Input precision
InferenceEngine::Precision, // Output precision
LayerTestsUtils::TargetDevice, // Device name
std::map<std::string, std::string> // Configuration
> convReluSequenceTestParamsSet;
class ConvolutionReluSequenceTest : public testing::WithParamInterface<convReluSequenceTestParamsSet>,

View File

@ -11,7 +11,8 @@ std::string ConvolutionReluSequenceTest::getTestCaseName(testing::TestParamInfo<
InferenceEngine::Precision netPrecision;
InferenceEngine::Precision inPrc, outPrc;
std::string targetDevice;
std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice) =
std::map<std::string, std::string> config;
std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice, config) =
obj.param;
std::ostringstream result;
@ -27,8 +28,13 @@ std::string ConvolutionReluSequenceTest::getTestCaseName(testing::TestParamInfo<
result << "PB" << CommonTestUtils::vec2str(single.padBegin) << "_";
result << "PE" << CommonTestUtils::vec2str(single.padEnd) << "_";
result << "O=" << single.numOutChannels << "_";
result << "PW" << CommonTestUtils::vec2str(single.poolingWindow) << "_";
result << "PS" << CommonTestUtils::vec2str(single.poolingStride) << "_";
}
for (auto&& single : config) {
result << single.first << "=" << single.second;
}
return result.str();
}
@ -37,8 +43,10 @@ void ConvolutionReluSequenceTest::SetUp() {
const InferenceEngine::SizeVector dilation = { 1, 1 };
convReluSpecificParamsAll convParamsAll;
auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice) =
std::map<std::string, std::string> config;
std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice, config) =
this->GetParam();
configuration.insert(config.begin(), config.end());
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, { convParamsAll.inputShape});
auto lastOutputs = ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params).front();
@ -67,6 +75,14 @@ void ConvolutionReluSequenceTest::SetUp() {
ngPrc, single.kernelSize, single.strides, single.padBegin, single.padEnd,
dilation, ngraph::op::PadType::EXPLICIT, single.numOutChannels, addBiases, filter_weights, biases));
lastOutputs = std::make_shared<ngraph::opset1::Relu>(conv);
if (single.poolingWindow.size() == 2 &&
(single.poolingWindow[0] != 1 ||
single.poolingWindow[1] != 1)) {
lastOutputs = std::make_shared<ngraph::opset3::MaxPool>(lastOutputs, single.poolingStride,
ngraph::Shape{ 0, 0 },
ngraph::Shape{ 0, 0 },
single.poolingWindow);
}
inputChannels = single.numOutChannels;
}