diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp index 4a758649e94..62ab9988019 100644 --- a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp +++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp @@ -255,34 +255,26 @@ void GNAPluginNS::backend::AMIntelDNN::InitConvolutional2DComponentPrivate(intel #endif void GNAPluginNS::backend::AMIntelDNN::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp, - uint32_t num_rows_in, - uint32_t num_columns_in, - uint32_t num_rows_out, - uint32_t num_columns_out, - uint32_t num_bytes_per_input, - uint32_t num_bytes_per_output, - uint32_t num_pool_size, - uint32_t num_pool_step, - uint32_t num_pool_stride, - bool do_sum_not_max, - float output_scale_factor, - void *&ptr_inputs, - void *&ptr_outputs, - bool postInitMem) { - comp.num_rows_in = num_rows_in; - comp.num_columns_in = num_columns_in; - comp.num_rows_out = num_rows_out; - comp.num_columns_out = num_columns_out; + std::array inCHW, + std::array outCHW, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + std::array poolingWindowXY, + std::array poolingStrideXY, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem) { comp.num_bytes_per_input = num_bytes_per_input; comp.num_bytes_per_output = num_bytes_per_output; comp.operation = kDnnMaxPoolOp; comp.macro_operation = kDnnMacroOpNone; comp.orientation_in = kDnnNonInterleavedOrientation; comp.orientation_out = kDnnNonInterleavedOrientation; - comp.op.maxpool.num_inputs = num_pool_size; - comp.op.maxpool.num_inputs_step = num_pool_step; - comp.op.maxpool.num_inputs_stride = num_pool_stride; - comp.op.maxpool.do_sum_not_max = do_sum_not_max; + comp.op.maxpool.inCHW = inCHW; + comp.op.maxpool.outCHW = outCHW; + comp.op.maxpool.poolingWindowXY = poolingWindowXY; + comp.op.maxpool.poolingStrideXY = poolingStrideXY; comp.output_scale_factor = output_scale_factor; 
comp.input_scale_factor = output_scale_factor; if (!postInitMem) { @@ -1209,11 +1201,17 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_ } break; case kDnnMaxPoolOp: { - uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1; - out_file << " " << std::dec << num_pool_type << "\n"; - out_file << " " << std::dec << component[i].op.maxpool.num_inputs << "\n"; - out_file << " " << std::dec << component[i].op.maxpool.num_inputs_step << "\n"; - out_file << " " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n"; + out_file << " MAX\n"; + out_file << " " << std::dec << component[i].op.maxpool.poolingWindowXY[0] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.poolingWindowXY[1] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.poolingStrideXY[0] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.poolingStrideXY[1] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.inCHW[0] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.inCHW[1] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.inCHW[2] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.outCHW[0] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.outCHW[1] << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.outCHW[2] << "\n"; out_file << std::setprecision(12) << std::scientific << " " << component[i].output_scale_factor << "\n"; } @@ -1344,6 +1342,26 @@ uint32_t GNAPluginNS::backend::AMIntelDNN::CountLayers() { return n; } +namespace { +uint32_t outputFromConv(const uint32_t in, const uint32_t flt, const uint32_t stride) { + // floor[(in - flt)/stride] + 1, GNA Spec 1.24 + if (flt > in || flt == 0 || stride == 0) { + THROW_GNA_EXCEPTION << "Invalid (input, filter, stride) = (" << in << "," << flt << "," << stride << ")"; + } + return (in - flt) / stride + 1; +} + +uint32_t outputFromPooling(const uint32_t in, const 
uint32_t window, const uint32_t stride) { + // ceil[(in - window)/stride] + 1, GNA Spec 1.24 + if (window > in || window == 0 || stride == 0) { + THROW_GNA_EXCEPTION << "Invalid (input, window, stride) = (" << in << "," << window << "," << stride << ")"; + } + if (window == in) return 1; + + return (in - window - 1) / stride + 2; +} +} // namespace + #if GNA_LIB_VER == 2 void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(Gna2Model *gnaModel) { Gna2Operation * gnaOperation; @@ -1622,18 +1640,29 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet #if GNA_LIB_VER == 2 } else if (gnaOperation->Type == Gna2OperationTypeConvolution) { auto pwlOperand = gnaOperation->Operands[PwlOpIdx]; - if (pwlOperand != nullptr && pwlOperand->Shape.Dimensions[0] != 0) { - THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i; + if (pwlOperand != nullptr && pwlOperand->Shape.Dimensions[0] != 0 && + gnaOperation->Operands[InOpIdx]->Shape.NumberOfDimensions == 2) { // kDnnConvolutional1dOp + THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at index == " << i; } else { const auto poolMode = reinterpret_cast(gnaUserAllocator(sizeof(Gna2PoolingMode))); IE_ASSERT(poolMode != nullptr); - *poolMode = (comp.op.maxpool.do_sum_not_max) ? 
Gna2PoolingModeSum : Gna2PoolingModeMax; - const auto poolWindow = create_shape1D_parameter(comp.op.maxpool.num_inputs); - const auto poolStride = create_shape1D_parameter(comp.op.maxpool.num_inputs_step); + *poolMode = Gna2PoolingModeMax; + + Gna2Shape* poolWindow{}; + Gna2Shape* poolStride{}; + + if (gnaOperation->Operands[InOpIdx]->Shape.NumberOfDimensions == 2) { // kDnnConvolutional1dOp + // TODO: issue 50379 find out why looks like CNN1D pooling uses stride == window only + poolWindow = create_shape1D_parameter(comp.op.maxpool.poolingWindowXY[0]); + poolStride = create_shape1D_parameter(comp.op.maxpool.poolingWindowXY[0]); + } else { + poolWindow = create_shape2D_parameter(comp.op.maxpool.poolingWindowXY[1], comp.op.maxpool.poolingWindowXY[0]); + poolStride = create_shape2D_parameter(comp.op.maxpool.poolingStrideXY[1], comp.op.maxpool.poolingStrideXY[0]); + } // number of output columns correction - based on GNA-library expectations - if ((gnaOperation->NumberOfParameters > PoolModeParamIdx && gnaOperation->Parameters[PoolModeParamIdx] !=nullptr) || + if ((gnaOperation->NumberOfParameters > PoolModeParamIdx && gnaOperation->Parameters[PoolModeParamIdx] != nullptr) || (gnaOperation->NumberOfParameters > PoolWinParamIdx && gnaOperation->Parameters[PoolWinParamIdx] != nullptr) || (gnaOperation->NumberOfParameters > PoolStrideParamIdx && gnaOperation->Parameters[PoolStrideParamIdx] != nullptr)) { THROW_GNA_EXCEPTION << "Pooling parameters should not be initialized"; @@ -1642,15 +1671,41 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet HelperGna2OperationSetParameter(gnaOperation, gnaUserAllocator, gnaUserFree, PoolWinParamIdx, poolWindow); HelperGna2OperationSetParameter(gnaOperation, gnaUserAllocator, gnaUserFree, PoolStrideParamIdx, poolStride); - const auto inVecCnt = gnaOperation->Operands[InOpIdx]->Shape.Dimensions[1]; + auto& outputTensor = const_cast(*gnaOperation->Operands[OutOpIdx]); + const auto fltStrideShape = 
reinterpret_cast(gnaOperation->Parameters[ConvStrideParamIdx]); + // adjust Gna2OperationTypeConvolution fused layer output dimensions to reflect convolution zeroPadding and pooling + if (gnaOperation->Operands[InOpIdx]->Shape.NumberOfDimensions == 2) { // kDnnConvolutional1dOp + const auto inVecCnt = gnaOperation->Operands[InOpIdx]->Shape.Dimensions[1]; - const auto nFltSize = gnaOperation->Operands[FilterOpIdx]->Shape.Dimensions[1]; - // Always move 1 "row" - const auto fltStrideSz = reinterpret_cast(gnaOperation->Parameters[ConvStrideParamIdx])->Dimensions[0]; - const auto maxNCOE = (inVecCnt - nFltSize) / fltStrideSz + 1; - // FLAT input matrix, pooled outputs per filter - const_cast(gnaOperation->Operands[OutOpIdx])->Shape.Dimensions[1] = - (maxNCOE - 1) / poolStride->Dimensions[0] + 1; + const auto nFltSize = gnaOperation->Operands[FilterOpIdx]->Shape.Dimensions[1]; + // Always move 1 "row" + const auto fltStride = fltStrideShape->Dimensions[0]; + const auto outFromConv = outputFromConv(inVecCnt, nFltSize, fltStride); + // FLAT input matrix, pooled outputs per filter + // TODO: Issue 50386 check why (outFromConv - 1) and not (outFromConv - poolingWindow) + outputTensor.Shape.Dimensions[1] = + (outFromConv - 1) / poolStride->Dimensions[0] + 1; + } else { // kDnnConvolutional2dOp + // Override GNA operation output pointer with the one from pooling component + outputTensor.Data = comp.ptr_outputs; + + Gna2Shape zeroPadding{}; + if (gnaOperation->NumberOfParameters > ZeroPaddingParamIdx && gnaOperation->Parameters[ZeroPaddingParamIdx] != nullptr) { + zeroPadding = *reinterpret_cast(gnaOperation->Parameters[ZeroPaddingParamIdx]); + } + const int beginOfHInNHWC = 1; + const int beginOfHInHW = 0; + for (auto&& dimHW : { 0, 1 }) { + const auto inputPadded = gnaOperation->Operands[InOpIdx]->Shape.Dimensions[beginOfHInNHWC + dimHW] + + zeroPadding.Dimensions[beginOfHInHW + dimHW] * 2; + const auto nFltSize = 
gnaOperation->Operands[FilterOpIdx]->Shape.Dimensions[beginOfHInNHWC + dimHW]; + const auto fltStride = fltStrideShape->Dimensions[beginOfHInHW + dimHW]; + const auto outFromConv = outputFromConv(inputPadded, nFltSize, fltStride); + outputTensor.Shape.Dimensions[beginOfHInNHWC + dimHW] = + outputFromPooling(outFromConv, poolWindow->Dimensions[beginOfHInHW + dimHW], poolStride->Dimensions[beginOfHInHW + dimHW]); + } + AdvanceOperationIfAllApplied(component, i, gnaOperation); + } } #else } else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) { @@ -1662,21 +1717,18 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet if (pConvolutionalLayer->pwl.nSegments != 0) { THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i; } else { - pConvolutionalLayer->poolType = - (component[i].op.maxpool.do_sum_not_max) ? INTEL_SUM_POOLING : INTEL_MAX_POOLING; - pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs; - pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step; - + pConvolutionalLayer->poolType = INTEL_MAX_POOLING; + // TODO: issue 50379 find out why looks like CNN1D pooling uses stride == window only + pConvolutionalLayer->nPoolSize = component[i].op.maxpool.poolingWindowXY[0]; + pConvolutionalLayer->nPoolStride = component[i].op.maxpool.poolingWindowXY[0]; // number of output columns correction - based on GNA-library expectations auto nFltSize = pConvolutionalLayer->nFilterCoefficients; auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns; // always move 1 "row" - auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1; + auto outFromConv = outputFromConv(pLayer->nInputColumns, nFltSize, fltStrideSz); // FLAT input matrix, pooled outputs per filter - pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1); - - // old code - // pLayer->nOutputColumns /= 
pConvolutionalLayer->nPoolStride; + // TODO: Issue 50386 check why (outFromConv - 1) and not (outFromConv - nPoolSize) + pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((outFromConv - 1) / pConvolutionalLayer->nPoolStride + 1); } #endif } else { @@ -1729,7 +1781,7 @@ void GNAPluginNS::backend::AMIntelDNN::InitGNAStruct(intel_nnet_type_t *ptr_nnet } } } - gnaOperation++; + AdvancePwlOperationIfAllApplied(component, i, gnaOperation); #else pLayer->pOutputs = component[i].ptr_outputs; pLayer->nBytesPerOutput = component[i].num_bytes_per_output; diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp index 1a64f9b841f..3c635b5cab7 100644 --- a/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp +++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.hpp @@ -166,30 +166,22 @@ public: template static void InitMaxpoolComponent(intel_dnn_component_t &cmp, - uint32_t num_rows_in, - uint32_t num_columns_in, - uint32_t num_rows_out, - uint32_t num_columns_out, + std::array inCHW, + std::array outCHW, uint32_t num_bytes_per_input, uint32_t num_bytes_per_output, - uint32_t num_pool_size, - uint32_t num_pool_step, - uint32_t num_pool_stride, - bool do_sum_not_max, + std::array poolingWindowXY, + std::array poolingStrideXY, float output_scale_factor, A *&ptr_inputs, B *&ptr_outputs) { InitMaxpoolComponentPrivate(cmp, - num_rows_in, - num_columns_in, - num_rows_out, - num_columns_out, + inCHW, + outCHW, num_bytes_per_input, num_bytes_per_output, - num_pool_size, - num_pool_step, - num_pool_stride, - do_sum_not_max, + poolingWindowXY, + poolingStrideXY, output_scale_factor, (void *&) ptr_inputs, (void *&) ptr_outputs, @@ -389,16 +381,12 @@ private: bool postInitMem); static void InitMaxpoolComponentPrivate(intel_dnn_component_t &cmp, - uint32_t num_rows_in, - uint32_t num_columns_in, - uint32_t num_rows_out, - uint32_t num_columns_out, + std::array inCHW, + std::array outCHW, uint32_t 
num_bytes_per_input, uint32_t num_bytes_per_output, - uint32_t num_pool_size, - uint32_t num_pool_step, - uint32_t num_pool_stride, - bool do_sum_not_max, + std::array poolingWindowXY, + std::array poolingStrideXY, float output_scale_factor, void *&ptr_inputs, void *&ptr_outputs, diff --git a/inference-engine/src/gna_plugin/backend/dnn.hpp b/inference-engine/src/gna_plugin/backend/dnn.hpp index 3369e73a98c..f75e092cd80 100644 --- a/inference-engine/src/gna_plugin/backend/dnn.hpp +++ b/inference-engine/src/gna_plugin/backend/dnn.hpp @@ -57,5 +57,12 @@ void AdvanceCnnOperationIfAllApplied(const std::vector& c } } +template +void AdvancePwlOperationIfAllApplied(const std::vector& component, int i, T*& operation) { + if (i == component.size() - 1 || (component[i + 1].operation != kDnnMaxPoolOp)) { + operation++; + } +} + } // namespace backend } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/backend/dnn_types.h b/inference-engine/src/gna_plugin/backend/dnn_types.h index ea0b5a1e399..78bfe17bd0a 100644 --- a/inference-engine/src/gna_plugin/backend/dnn_types.h +++ b/inference-engine/src/gna_plugin/backend/dnn_types.h @@ -164,10 +164,10 @@ typedef struct { } intel_convolutional2D_t; typedef struct { - uint32_t num_inputs; // pool size - uint32_t num_inputs_step; // pool step - uint32_t num_inputs_stride; // pool stride (number of convolution filters) - bool do_sum_not_max; + std::array poolingWindowXY; + std::array poolingStrideXY; + std::array inCHW; + std::array outCHW; } intel_maxpool_t; typedef struct { diff --git a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp index 6f38366f6e5..f115ec19353 100644 --- a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp @@ -523,7 +523,7 @@ class DataQuantizer : public DataQuantizerBas outData->setPrecision(Desc::mandatory().getInputPrecision()); } } else { - 
if (LayerInfo(*cnnLayer).isActivation() || + if (LayerInfo(*cnnLayer).isActivation() || LayerInfo(*cnnLayer).isCopy() || LayerInfo(*cnnLayer).isNonFunctional() || LayerInfo(*cnnLayer).isPermute() || @@ -533,6 +533,13 @@ class DataQuantizer : public DataQuantizerBas outData->setPrecision(Desc::mandatory().getInputPrecision()); } } + // for pooling layer output precision is the same as input precision + if (LayerInfo(*cnnLayer).isMaxPooling()) { + const auto inputPrecision = cnnLayer->insData.front().lock()->getPrecision(); + for (auto&& outData : cnnLayer->outData) { + outData->setPrecision(inputPrecision); + } + } } cnnLayer->precision = Desc::mandatory().getInputPrecision(); diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index bf6caf1e5b4..a16b0ab731f 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -833,15 +833,15 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { auto inputs = layer->insData.begin()->lock(); auto outputs = *layer->outData.begin(); - auto in_order = getFromIRDimsOrderNCHW(inputs->getLayout()); + const auto in_order = getFromIRDimsOrderNCHW(inputs->getLayout()); uint32_t w_dim_in = FROM_IR_DIM(inputs, in_order[3]); uint32_t h_dim_in = FROM_IR_DIM(inputs, in_order[2]); - uint32_t c_dim_in = FROM_IR_DIM(inputs, in_order[1]); + const uint32_t c_dim_in = FROM_IR_DIM(inputs, in_order[1]); - auto out_order = getFromIRDimsOrderNCHW(outputs->getLayout()); + const auto out_order = getFromIRDimsOrderNCHW(outputs->getLayout()); uint32_t w_dim_out = FROM_IR_DIM(outputs, out_order[3]); uint32_t h_dim_out = FROM_IR_DIM(outputs, out_order[2]); - uint32_t c_dim_out = FROM_IR_DIM(outputs, out_order[1]); + const uint32_t c_dim_out = FROM_IR_DIM(outputs, out_order[1]); if (w_dim_in == 1) { // swap dimensions if needed to support swapped 1D case swap(h_dim_in, w_dim_in); @@ 
-849,12 +849,6 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { swap(pooling._kernel[X_AXIS], pooling._kernel[Y_AXIS]); } - uint32_t num_rows_in = w_dim_in; - uint32_t num_columns_in = c_dim_in; - uint32_t num_rows_out = w_dim_out; - uint32_t num_columns_out = c_dim_out; - uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; - void* ptr_inputs = nullptr; void* ptr_outputs = nullptr; @@ -870,16 +864,12 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { } dnn->InitMaxpoolComponent(currentComponent, - 1, - num_columns_in * num_rows_in, - 1, - num_columns_out * num_rows_out, + { c_dim_in, h_dim_in, w_dim_in }, + { c_dim_out, h_dim_out, w_dim_out }, inputs->getPrecision().size(), outputs->getPrecision().size(), - pooling._kernel[X_AXIS], - pooling._kernel[X_AXIS], - num_columns_in, - false, + { pooling._kernel[X_AXIS], pooling._kernel[Y_AXIS] }, + { pooling._stride[X_AXIS], pooling._stride[Y_AXIS] }, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(), ptr_inputs, ptr_outputs); @@ -887,7 +877,11 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims())) * outputs->getPrecision().size(); - size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->getPrecision().size(); + const auto hw_in = h_dim_in * w_dim_in; + + // TODO: Is this really needed?, find out why + uint32_t num_padding = ALIGN(hw_in, 8) - hw_in; + size_t num_data_bytes_in = c_dim_in * (hw_in + num_padding) * inputs->getPrecision().size(); connectInput(layer, ptr_inputs, num_data_bytes_in); connectOutput(layer, ptr_outputs, num_data_bytes_out); diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index e0441e5f805..10d79f74d7d 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ 
b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -247,12 +247,22 @@ void GNAPlugin::ExportScores(void *ptr_dst, ptr_src_vec, num_active_elements * sizeof(int16_t)); } } else if (num_bytes_per_element == 4) { // should work for both int and float - for (uint32_t i = 0; i < num_frames; i++) { - void *ptr_dst_vec = reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(float); - const void *ptr_src_vec = reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(float); - memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float)); - ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(float), - ptr_src_vec, num_active_elements * sizeof(float)); + if (num_bytes_per_element_input == 2) { + for (uint32_t i = 0; i < num_frames; i++) { + auto ptr_dst_vec = reinterpret_cast(ptr_dst) + i * num_vector_elements; + auto ptr_src_vec = reinterpret_cast(ptr_src) + i * num_vector_stride; + for (uint32_t j = 0; j < num_vector_elements; j++) { + ptr_dst_vec[j] = ptr_src_vec[j]; + } + } + } else { + for (uint32_t i = 0; i < num_frames; i++) { + void* ptr_dst_vec = reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(float); + const void* ptr_src_vec = reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(float); + memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float)); + ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(float), + ptr_src_vec, num_active_elements * sizeof(float)); + } } } else { THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes"; diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index 8af50ebc32e..a085a3f757c 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -208,6 +208,16 @@ static std::vector getCandidatesForIdentityInsertion(const CNNLayer auto prevLayer = PrevFunctionalLayer(l, 0); + // No need to 
insert identity activation + // when activation was already there before pooling + // in case of CNN -> Activation -> Pooling order + if (LayerInfo(prevLayer).isPooling()) { + auto prevPrevLayer = PrevFunctionalLayer(prevLayer, 0); + if (LayerInfo(prevPrevLayer).isActivation()) { + return prevLayers; + } + } + if (!LayerInfo(prevLayer).has32BOutput()) return prevLayers; @@ -312,6 +322,13 @@ void ForbidActivationFusingPass::run() { } } +namespace { + template + bool is2D(T&& vec) { + return vec.size() >= 2 && vec[0] > 1 && vec[1] > 1; + } +} // namespace + void ReorderMaxPoolPass::run() { // detecting following pattern // conv->relu->maxpooling @@ -320,6 +337,10 @@ void ReorderMaxPoolPass::run() { auto pool = LayerInfo(l); if (!pool.isMaxPooling()) continue; + // don't reorder if pooling is 2D for CNN2D + auto pooling = dynamic_cast(l.get()); + if (pooling == nullptr || (is2D(pooling->_kernel) || is2D(pooling->_stride))) continue; + // checking prev layer type auto activation = LayerInfo(CNNNetPrevLayer(l)); if (!activation.isActivation()) continue; diff --git a/inference-engine/src/gna_plugin/runtime/cnn.cpp b/inference-engine/src/gna_plugin/runtime/cnn.cpp index 65c99c50a78..32955cd0fac 100644 --- a/inference-engine/src/gna_plugin/runtime/cnn.cpp +++ b/inference-engine/src/gna_plugin/runtime/cnn.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include #include #include #include @@ -45,36 +46,38 @@ void CNNFilter32(intel_dnn_component_t *component) { } } -void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) { +void CNNMaxPoolLegacy(intel_dnn_component_t *component, intel_dnn_number_type_t number_type, const bool sumPoolingOverRide) { + const uint32_t num_inputs = component->op.maxpool.inCHW[0] * component->op.maxpool.inCHW[1] * component->op.maxpool.inCHW[2]; + const uint32_t in_c = component->op.maxpool.inCHW[0]; + // TODO: issue 50379 find out why looks like CNN1D pooling uses stride == window only + const 
uint32_t num_pool_size = component->op.maxpool.poolingWindowXY[0]; + const uint32_t num_pool_step = component->op.maxpool.poolingWindowXY[0]; + const uint32_t num_rows_in = num_inputs / in_c; + if (number_type == kDnnInt) { int32_t *ptr_inputs = reinterpret_cast(component->ptr_inputs); int32_t *ptr_outputs = reinterpret_cast(component->ptr_outputs); - uint32_t num_inputs = component->num_columns_in; - uint32_t num_columns = component->op.maxpool.num_inputs_stride; - uint32_t num_pool_size = component->op.maxpool.num_inputs; - uint32_t num_pool_step = component->op.maxpool.num_inputs_step; - uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride; - for (uint32_t i = 0; i < num_columns; i++) { + for (uint32_t i = 0; i < in_c; i++) { int32_t m = 0; - if (component->op.maxpool.do_sum_not_max) { + if (sumPoolingOverRide) { uint32_t num_saturate = 0; for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { int64_t sum = 0; uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size; for (uint32_t k = j; k < num_end; k++) { - sum += ptr_inputs[k * num_columns + i]; + sum += ptr_inputs[k * in_c + i]; } constexpr int32_t sum_max_threshold = std::numeric_limits::max(); constexpr int32_t sum_min_threshold = std::numeric_limits::min(); if (sum > sum_max_threshold) { - ptr_outputs[m * num_columns + i] = sum_max_threshold; + ptr_outputs[m * in_c + i] = sum_max_threshold; num_saturate++; } else if (sum < sum_min_threshold) { - ptr_outputs[m * num_columns + i] = sum_min_threshold; + ptr_outputs[m * in_c + i] = sum_min_threshold; num_saturate++; } else { - ptr_outputs[m * num_columns + i] = static_cast(sum); + ptr_outputs[m * in_c + i] = static_cast(sum); } m++; } @@ -86,9 +89,9 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number int32_t max = INT32_MIN; uint32_t num_end = (j + num_pool_size > num_rows_in) ? 
num_rows_in : j + num_pool_size; for (uint32_t k = j; k < num_end; k++) { - if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i]; + if (ptr_inputs[k * in_c + i] > max) max = ptr_inputs[k * in_c + i]; } - ptr_outputs[m * num_columns + i] = max; + ptr_outputs[m * in_c + i] = max; m++; } } @@ -96,22 +99,17 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number } else { float *ptr_inputs = reinterpret_cast(component->ptr_inputs); float *ptr_outputs = reinterpret_cast(component->ptr_outputs); - uint32_t num_inputs = component->num_columns_in; - uint32_t num_columns = component->op.maxpool.num_inputs_stride; - uint32_t num_pool_size = component->op.maxpool.num_inputs; - uint32_t num_pool_step = component->op.maxpool.num_inputs_step; - uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride; - for (uint32_t i = 0; i < num_columns; i++) { + for (uint32_t i = 0; i < in_c; i++) { int32_t m = 0; - if (component->op.maxpool.do_sum_not_max) { + if (sumPoolingOverRide) { for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { float sum = 0.0; uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size; for (uint32_t k = j; k < num_end; k++) { - sum += ptr_inputs[k * num_columns + i]; + sum += ptr_inputs[k * in_c + i]; } - ptr_outputs[m * num_columns + i] = sum; + ptr_outputs[m * in_c + i] = sum; m++; } } else { @@ -119,9 +117,9 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number float max = -1e20f; uint32_t num_end = (j + num_pool_size > num_rows_in) ? 
num_rows_in : j + num_pool_size; for (uint32_t k = j; k < num_end; k++) { - if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i]; + if (ptr_inputs[k * in_c + i] > max) max = ptr_inputs[k * in_c + i]; } - ptr_outputs[m * num_columns + i] = max; + ptr_outputs[m * in_c + i] = max; m++; } } @@ -129,13 +127,63 @@ void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number } } -#if GNA_LIB_VER == 2 +namespace { // a1: fastest changing index // A - size neede template T getQubeIndex(T a1, T a2, T a3, T A2, T A3) { return a1 * A2 * A3 + a2 * A3 + a3; } +} // namespace + +float MaxPool2D32SingleHWC(const unsigned poolWinH, const unsigned poolWinW, + const float* input, const unsigned IH, const unsigned IW, const unsigned IC, + const unsigned oh, const unsigned ow, const unsigned oc, + const uint32_t poolStrideH, + const uint32_t poolStrideW) { + float output = std::numeric_limits::lowest(); + const auto winStartH = oh * poolStrideH; + const auto winStartW = ow * poolStrideW; + for (unsigned winIdxH = 0; winIdxH < poolWinH && winStartH + winIdxH < IH; winIdxH++) { + for (unsigned winIdxW = 0; winIdxW < poolWinW && winStartW + winIdxW < IW; winIdxW++) { + const auto inputIndex = getQubeIndex(winStartH + winIdxH, winStartW + winIdxW, oc, IW, IC); + output = (std::max)(output, input[inputIndex]); + } + } + return output; +} + +void CNNMaxPool2DFloat(intel_dnn_component_t* component) { + float* ptr_inputs = reinterpret_cast(component->ptr_inputs); + float* ptr_outputs = reinterpret_cast(component->ptr_outputs); + const auto OC = component->op.maxpool.outCHW[0]; + const auto OH = component->op.maxpool.outCHW[1]; + const auto OW = component->op.maxpool.outCHW[2]; + + const auto IC = component->op.maxpool.inCHW[0]; + const auto IH = component->op.maxpool.inCHW[1]; + const auto IW = component->op.maxpool.inCHW[2]; + + const auto poolWinW = component->op.maxpool.poolingWindowXY[0]; + const auto poolWinH = 
component->op.maxpool.poolingWindowXY[1]; + const auto poolStrideW = component->op.maxpool.poolingStrideXY[0]; + const auto poolStrideH = component->op.maxpool.poolingStrideXY[1]; + + for (unsigned oc = 0; oc < OC; oc++) { + for (unsigned ow = 0; ow < OW; ow++) { + for (unsigned oh = 0; oh < OH; oh++) { + const auto outputIndex = getQubeIndex(oh, ow, oc, OW, OC); + ptr_outputs[outputIndex] = MaxPool2D32SingleHWC(poolWinH, poolWinW, + ptr_inputs, IH, IW, IC, + oh, ow, oc, + poolStrideH, + poolStrideW); + } + } + } +} + +#if GNA_LIB_VER == 2 bool matchesPaddedArea(unsigned filterIndex, unsigned outputIndex, unsigned inputSize, unsigned paddingSize, unsigned stride) { const auto paddedIndex = stride * outputIndex + filterIndex; @@ -228,3 +276,23 @@ void CNN2DFilter32(intel_dnn_component_t* component) { } #endif + +namespace { +template +bool is2D(T&& vec) { + return vec.size() >= 2 && vec[0] > 1 && vec[1] > 1; +} +} // namespace + +void CNNMaxPool(intel_dnn_component_t* component, intel_dnn_number_type_t number_type, const bool sumPoolingOverRide) { + if (is2D(component->op.maxpool.poolingStrideXY) || + is2D(component->op.maxpool.poolingWindowXY)) { + if (!sumPoolingOverRide) { + CNNMaxPool2DFloat(component); + } else { + THROW_GNA_EXCEPTION << "SUM pooling2D not supported"; + } + } else { + CNNMaxPoolLegacy(component, number_type, sumPoolingOverRide); + } +} diff --git a/inference-engine/src/gna_plugin/runtime/cnn.h b/inference-engine/src/gna_plugin/runtime/cnn.h index 5f3a667ab91..03b24d5babd 100644 --- a/inference-engine/src/gna_plugin/runtime/cnn.h +++ b/inference-engine/src/gna_plugin/runtime/cnn.h @@ -12,7 +12,7 @@ #define CNN_MAX_POOL_SIZE 6 void CNNFilter32(intel_dnn_component_t *component); -void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type); +void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type, const bool sumPoolingOverRide = false); #if GNA_LIB_VER == 2 void 
CNN2DFilter32(intel_dnn_component_t* component); diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp index aff4668c8d9..a8e54a18c2a 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp @@ -4,6 +4,8 @@ #include +#include "gna/gna_config.hpp" + #include "subgraph_tests/convolution_relu_sequence.hpp" #include "common_test_utils/test_constants.hpp" @@ -20,20 +22,49 @@ const std::vector inputShapeSimple = { {1, 32, 64, 16}, }; +const std::vector inputShapeSimpleWithPooling = { + {1, 32, 128, 32}, +}; + const std::vector convReluSpecificParamsSimpleSeq { { {2, 2}, // Kernel size {2, 2}, // Stride {0, 0}, // Pad begin {0, 0}, // Pad end - 3 // Num out channels + 3, // Num out channels + {1, 1}, //Pooling window + {1, 1} //Pooling stride }, { {2, 5}, // Kernel size {2, 3}, // Stride {0, 0}, // Pad begin {0, 0}, // Pad end - 8 // Num out channels + 8, // Num out channels + {1, 1}, //Pooling window + {1, 1} //Pooling stride + }, +}; + +const std::vector convReluSpecificParamsSimpleSeqWithPooling { + { + {3, 3}, // Kernel size + {1, 1}, // Stride + {0, 0}, // Pad begin + {0, 0}, // Pad end + 3, // Num out channels + {2, 3}, //Pooling window + {2, 3} //Pooling stride + }, + { + {2, 2}, // Kernel size + {1, 2}, // Stride + {0, 0}, // Pad begin + {0, 0}, // Pad end + 8, // Num out channels + {2, 3}, //Pooling window + {2, 2} //Pooling stride }, }; @@ -47,21 +78,27 @@ const std::vector convReluSpecificParamsFBSeq = { {1, 1}, // Stride {2, 3}, // Pad begin {2, 3}, // Pad end - 32 // Num out channels + 32, // Num out channels + {1, 1}, //Pooling window + {1, 1} //Pooling stride }, { {9, 5}, // Kernel size 
{1, 1}, // Stride {4, 2}, // Pad begin {4, 2}, // Pad end - 32 // Num out channels + 32, // Num out channels + {1, 1}, //Pooling window + {1, 1} //Pooling stride }, { {1, 1}, // Kernel size {1, 1}, // Stride {0, 0}, // Pad begin {0, 0}, // Pad end - 8 // Num out channels + 8, // Num out channels + {1, 1}, //Pooling window + {1, 1} //Pooling stride }, }; @@ -73,6 +110,22 @@ const std::vector convReluSpecificParamsAllAll = { { inputShapeFB, convReluSpecificParamsFBSeq + }, + { + inputShapeSimpleWithPooling, + convReluSpecificParamsSimpleSeqWithPooling + } +}; + +const std::vector > configs = { + { + {InferenceEngine::GNAConfigParams::KEY_GNA_DEVICE_MODE, InferenceEngine::GNAConfigParams::GNA_AUTO} + }, + { + {InferenceEngine::GNAConfigParams::KEY_GNA_DEVICE_MODE, InferenceEngine::GNAConfigParams::GNA_SW_FP32} + }, + { + {InferenceEngine::GNAConfigParams::KEY_GNA_DEVICE_MODE, InferenceEngine::GNAConfigParams::GNA_SW_EXACT} } }; @@ -83,7 +136,8 @@ INSTANTIATE_TEST_CASE_P(DISABLED_smoke_ConvolutionReluSequenceTest, ConvolutionR ::testing::ValuesIn(netPrecisions), ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), ::testing::Values(InferenceEngine::Precision::UNSPECIFIED), - ::testing::Values(CommonTestUtils::DEVICE_GNA)), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(configs)), ConvolutionReluSequenceTest::getTestCaseName); } // namespace diff --git a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/convolution_relu_sequence.hpp b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/convolution_relu_sequence.hpp index 58d8ccda501..a70de5d41e6 100644 --- a/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/convolution_relu_sequence.hpp +++ b/inference-engine/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/convolution_relu_sequence.hpp @@ -22,6 +22,8 @@ typedef struct { std::vector padBegin; 
std::vector padEnd; size_t numOutChannels; + InferenceEngine::SizeVector poolingWindow; + InferenceEngine::SizeVector poolingStride; } convReluSpecificParams; typedef struct { @@ -30,11 +32,12 @@ typedef struct { } convReluSpecificParamsAll; typedef std::tuple< - convReluSpecificParamsAll, // CNN2D sequence desc - InferenceEngine::Precision, // Net precision - InferenceEngine::Precision, // Input precision - InferenceEngine::Precision, // Output precision - LayerTestsUtils::TargetDevice // Device name + convReluSpecificParamsAll, // CNN2D sequence desc + InferenceEngine::Precision, // Net precision + InferenceEngine::Precision, // Input precision + InferenceEngine::Precision, // Output precision + LayerTestsUtils::TargetDevice, // Device name + std::map // Configuration > convReluSequenceTestParamsSet; class ConvolutionReluSequenceTest : public testing::WithParamInterface, diff --git a/inference-engine/tests/functional/shared_test_classes/src/subgraph/convolution_relu_sequence.cpp b/inference-engine/tests/functional/shared_test_classes/src/subgraph/convolution_relu_sequence.cpp index 0d3d3da7098..76dfdd8f990 100644 --- a/inference-engine/tests/functional/shared_test_classes/src/subgraph/convolution_relu_sequence.cpp +++ b/inference-engine/tests/functional/shared_test_classes/src/subgraph/convolution_relu_sequence.cpp @@ -11,7 +11,8 @@ std::string ConvolutionReluSequenceTest::getTestCaseName(testing::TestParamInfo< InferenceEngine::Precision netPrecision; InferenceEngine::Precision inPrc, outPrc; std::string targetDevice; - std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice) = + std::map config; + std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice, config) = obj.param; std::ostringstream result; @@ -27,8 +28,13 @@ std::string ConvolutionReluSequenceTest::getTestCaseName(testing::TestParamInfo< result << "PB" << CommonTestUtils::vec2str(single.padBegin) << "_"; result << "PE" << CommonTestUtils::vec2str(single.padEnd) << "_"; result << 
"O=" << single.numOutChannels << "_"; + result << "PW" << CommonTestUtils::vec2str(single.poolingWindow) << "_"; + result << "PS" << CommonTestUtils::vec2str(single.poolingStride) << "_"; } + for (auto&& single : config) { + result << single.first << "=" << single.second; + } return result.str(); } @@ -37,8 +43,10 @@ void ConvolutionReluSequenceTest::SetUp() { const InferenceEngine::SizeVector dilation = { 1, 1 }; convReluSpecificParamsAll convParamsAll; auto netPrecision = InferenceEngine::Precision::UNSPECIFIED; - std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice) = + std::map config; + std::tie(convParamsAll, netPrecision, inPrc, outPrc, targetDevice, config) = this->GetParam(); + configuration.insert(config.begin(), config.end()); auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); auto params = ngraph::builder::makeParams(ngPrc, { convParamsAll.inputShape}); auto lastOutputs = ngraph::helpers::castOps2Nodes(params).front(); @@ -67,6 +75,14 @@ void ConvolutionReluSequenceTest::SetUp() { ngPrc, single.kernelSize, single.strides, single.padBegin, single.padEnd, dilation, ngraph::op::PadType::EXPLICIT, single.numOutChannels, addBiases, filter_weights, biases)); lastOutputs = std::make_shared(conv); + if (single.poolingWindow.size() == 2 && + (single.poolingWindow[0] != 1 || + single.poolingWindow[1] != 1)) { + lastOutputs = std::make_shared(lastOutputs, single.poolingStride, + ngraph::Shape{ 0, 0 }, + ngraph::Shape{ 0, 0 }, + single.poolingWindow); + } inputChannels = single.numOutChannels; }