Fix performance on resnet50 quantized models (#7670)

* Fix performance on resnet50 quantized models.
  LP transformations won't work on the model unless the last 4 inputs to FakeQuantize are constants. To meet that requirement, we need to perform constant folding for those inputs in the QuantizeLinear ONNX operator. Ticket: 65375
* Fix the "Cannot find blob with name: y" exception during onnx_model_quant_conv_linear
* Remove linking with onnx_ngraph_frontend
* Fix exclude path
2021-10-13 09:18:37 +02:00 · 2021-10-13 09:18:37 +02:00 · db527fff41
commit db527fff41
parent 0d020974f9
14 changed files with 401 additions and 16 deletions
--- a/cmake/test_model_zoo.cmake
+++ b/cmake/test_model_zoo.cmake
@ -66,6 +66,11 @@ ov_model_convert("${CMAKE_CURRENT_SOURCE_DIR}/ngraph/test"
                 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/test_model_zoo/ngraph"
                  onnx_out_files)
 set(rel_path "inference-engine/tests/functional/plugin/shared/models")
 ov_model_convert("${OpenVINO_SOURCE_DIR}/${rel_path}"
                 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/test_model_zoo/func_tests/models"
                 ft_out_files)
 set(rel_path "inference-engine/tests/functional/inference_engine/onnx_reader")
 ov_model_convert("${OpenVINO_SOURCE_DIR}/${rel_path}"
                 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/test_model_zoo/onnx_reader"
@ -116,6 +121,7 @@ if(ENABLE_TESTS)
    endif()
    add_custom_target(test_model_zoo DEPENDS ${onnx_out_files}
                                             ${ft_out_files}
                                             ${ie_onnx_out_files}
                                             ${ie_serialize_out_files}
                                             ${ie_onnx_import_out_files})
--- a/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp
+++ b/inference-engine/src/low_precision_transformations/include/low_precision/network_helper.hpp
@ -80,11 +80,11 @@ public:
    static std::shared_ptr<Node> swapMultiplyAndAdd(std::shared_ptr<opset1::Add> addAfterMultiply, const int multiplyBranch);
-    static void copyInfo(const std::vector<std::shared_ptr<Node>>& sources, const std::vector<std::shared_ptr<Node>>& targets);
+    static void copyInfo(const std::vector<std::shared_ptr<Node>>& sources, const std::vector<std::shared_ptr<Node>>& targets, bool overrideName = true);
-    static void copyInfo(const std::vector<std::shared_ptr<Node>>& sources, const std::shared_ptr<Node>& target);
+    static void copyInfo(const std::vector<std::shared_ptr<Node>>& sources, const std::shared_ptr<Node>& target, bool overrideName = true);
-    static void copyInfo(const std::shared_ptr<Node>& source, const std::shared_ptr<Node>& target);
+    static void copyInfo(const std::shared_ptr<Node>& source, const std::shared_ptr<Node>& target, bool overrideName = true);
    static bool isScalarLike(std::shared_ptr<opset1::Constant> constant);
--- a/inference-engine/src/low_precision_transformations/src/mat_mul.cpp
+++ b/inference-engine/src/low_precision_transformations/src/mat_mul.cpp
@ -108,7 +108,7 @@ bool MatMulTransformation::transform(TransformationContext &context, ngraph::pat
        // multiply by weights: [1, ..., 1, Y] x [Y, Z] => [1, ..., 1, Z]
        const auto newSubConst = NetworkHelper::toScalarIfPossible(fold<opset1::MatMul>(
-            broadcastedConst,
+            foldConvert(broadcastedConst, newMatMul->get_element_type()),
            foldConvert(newMatMul->input_value(1), newMatMul->get_element_type()),
            newMatMul->get_transpose_a(),
            newMatMul->get_transpose_b()));
--- a/inference-engine/src/low_precision_transformations/src/network_helper.cpp
+++ b/inference-engine/src/low_precision_transformations/src/network_helper.cpp
@ -302,12 +302,13 @@ std::shared_ptr<Node> NetworkHelper::swapMultiplyAndAdd(std::shared_ptr<opset1::
 void NetworkHelper::copyInfo(
    const std::vector<std::shared_ptr<Node>>& sources,
-    const std::vector<std::shared_ptr<Node>>& targets) {
+    const std::vector<std::shared_ptr<Node>>& targets,
    bool overrideName) {
    ngraph::copy_runtime_info(sources, targets);
    for (const auto& target : targets) {
        const std::string friendlyName = sources[0]->get_friendly_name();
-        if (!friendlyName.empty()) {
+        if (!friendlyName.empty() && overrideName) {
            target->set_friendly_name(friendlyName);
        }
@ -345,12 +346,12 @@ void NetworkHelper::copyInfo(
    }
 }
-void NetworkHelper::copyInfo(const std::vector<std::shared_ptr<Node>>& sources, const std::shared_ptr<Node>& target) {
+void NetworkHelper::copyInfo(const std::vector<std::shared_ptr<Node>>& sources, const std::shared_ptr<Node>& target, bool overrideName) {
-    copyInfo(sources, std::vector<std::shared_ptr<Node>>{ target });
+    copyInfo(sources, std::vector<std::shared_ptr<Node>>{ target }, overrideName);
 }
-void NetworkHelper::copyInfo(const std::shared_ptr<Node>& source, const std::shared_ptr<Node>& target) {
+void NetworkHelper::copyInfo(const std::shared_ptr<Node>& source, const std::shared_ptr<Node>& target, bool overrideName) {
-    copyInfo(std::vector<std::shared_ptr<Node>>{ source }, std::vector<std::shared_ptr<Node>>{ target });
+    copyInfo(std::vector<std::shared_ptr<Node>>{ source }, std::vector<std::shared_ptr<Node>>{ target }, overrideName);
 }
 bool NetworkHelper::isScalarLike(std::shared_ptr<opset1::Constant> constant) {
@ -657,8 +658,10 @@ std::shared_ptr<opset1::FakeQuantize> NetworkHelper::fuseConvert(const std::shar
        ngraph::op::TemporaryReplaceOutputType(fakeQuantize->input_value(4), element::f32).get(),
        fakeQuantize->get_levels());
    NetworkHelper::setOutDataPrecisionForTypeRelaxed(newFakeQuantize, node->get_output_element_type(0));
    newFakeQuantize->set_friendly_name(node->get_friendly_name());
    replace_node(node->shared_from_this(), newFakeQuantize);
-    NetworkHelper::copyInfo(fakeQuantize, newFakeQuantize);
+    bool overrideName = false;
    NetworkHelper::copyInfo(fakeQuantize, newFakeQuantize, overrideName);
    return newFakeQuantize;
 }
--- a/inference-engine/tests/functional/plugin/cpu/CMakeLists.txt
+++ b/inference-engine/tests/functional/plugin/cpu/CMakeLists.txt
@ -16,7 +16,7 @@ if (NGRAPH_ONNX_FRONTEND_ENABLE)
    list(APPEND LINK_LIBRARIES onnx_custom_op)
    list(APPEND DEPENDENCIES template_extension onnx_custom_op)
 else()
-    set(EXCLUDED_SOURCE_PATHS "${CMAKE_CURRENT_SOURCE_DIR}/extension")
+    set(EXCLUDED_SOURCE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/extension ${CMAKE_CURRENT_SOURCE_DIR}/onnx)
 endif()
 addIeTargetTest(
--- a/inference-engine/tests/functional/plugin/cpu/onnx/quantized_models_tests.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/onnx/quantized_models_tests.cpp
@ -0,0 +1,12 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include <gtest/gtest.h>
 #include "onnx/quantized_models_tests.hpp"
 using namespace ONNXTestsDefinitions;
 // Instantiates every QuantizedModelsTests case once, with the CPU device name as the
 // test parameter; getTestCaseName supplies the readable name suffix for the instance.
 INSTANTIATE_TEST_SUITE_P(ONNXQuantizedModels, QuantizedModelsTests,
                        ::testing::Values(CommonTestUtils::DEVICE_CPU),
                        QuantizedModelsTests::getTestCaseName);
--- a/inference-engine/tests/functional/plugin/shared/CMakeLists.txt
+++ b/inference-engine/tests/functional/plugin/shared/CMakeLists.txt
@ -6,6 +6,14 @@ set(TARGET_NAME funcSharedTests)
 set(PUBLIC_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include")
 set(DEPENDENCIES inference_engine mock_engine HeteroPlugin MultiDevicePlugin)
 # The ONNX-based shared tests read their models from the test model zoo, so they are
 # only built when the ONNX frontend is enabled.
 if (NGRAPH_ONNX_FRONTEND_ENABLE)
    # Make sure the model zoo target is built before this library.
    list(APPEND DEPENDENCIES test_model_zoo)
    # TEST_MODELS is the directory prefix the ONNX tests prepend to model file names.
    list(APPEND DEFINES TEST_MODELS="${TEST_MODEL_ZOO}/func_tests/models/")
 else()
    # Without the ONNX frontend, leave the ONNX test sources out of the build.
    set(EXCLUDED_SOURCE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/onnx)
 endif()
 addIeTarget(
        NAME ${TARGET_NAME}
        TYPE STATIC
@ -15,6 +23,8 @@ addIeTarget(
        ADD_CPPLINT
        DEVELOPER_PACKAGE
            inference_engine_tests
        EXCLUDED_SOURCE_PATHS ${EXCLUDED_SOURCE_PATHS}
        DEFINES ${DEFINES}
        INCLUDES
            PUBLIC
                ${PUBLIC_HEADERS_DIR}
@ -29,10 +39,7 @@ addIeTarget(
                openvino::util
                inference_engine_transformations
        DEPENDENCIES
-            inference_engine
+            ${DEPENDENCIES}
            mock_engine
            HeteroPlugin
            MultiDevicePlugin
 )
 # CVS-55376
--- a/inference-engine/tests/functional/plugin/shared/include/onnx/quantized_models_tests.hpp
+++ b/inference-engine/tests/functional/plugin/shared/include/onnx/quantized_models_tests.hpp
@ -0,0 +1,22 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #pragma once
 #include <string>
 #include "shared_test_classes/base/layer_test_utils.hpp"
 namespace ONNXTestsDefinitions {
 // Parameterized fixture that runs quantized ONNX models on a plugin and checks the
 // element types reported by the runtime (execution) graph.
 // The single std::string test parameter is used as the target device name.
 class QuantizedModelsTests : public testing::WithParamInterface<std::string>,
                            virtual public LayerTestsUtils::LayerTestsCommon {
 public:
    // Returns the test-case name suffix, "device=" followed by the test parameter.
    static std::string getTestCaseName(const testing::TestParamInfo<std::string>& obj);
 protected:
    // Stores the test parameter as the target device.
    void SetUp() override;
    // Reads the model file `model` (a name relative to the TEST_MODELS directory), runs it,
    // and for every runtime-graph node whose friendly name is a key of `expected_layer_types`
    // asserts that the node's element type equals the mapped value. At least one such node
    // must be found.
    void runModel(const char* model, const std::unordered_map<std::string, ngraph::element::Type_t>& expected_layer_types);
 };
 } // namespace ONNXTestsDefinitions
--- a/inference-engine/tests/functional/plugin/shared/models/max_pool_fq.prototxt
+++ b/inference-engine/tests/functional/plugin/shared/models/max_pool_fq.prototxt
@ -0,0 +1,132 @@
 ir_version: 6
 producer_name: "pytorch"
 producer_version: "1.8"
 graph {
  node {
    output: "884"
    name: "Constant_10"
    op_type: "Constant"
    attribute {
      name: "value"
      t {
        dims: 1
        data_type: 1
        raw_data: "\000\000\000\000"
      }
      type: TENSOR
    }
  }
  node {
    output: "885"
    name: "Constant_11"
    op_type: "Constant"
    attribute {
      name: "value"
      t {
        dims: 1
        data_type: 1
        raw_data: "6\241\311@"
      }
      type: TENSOR
    }
  }
  node {
    input: "883"
    input: "884"
    input: "885"
    input: "884"
    input: "885"
    output: "886"
    name: "FakeQuantize_12"
    op_type: "FakeQuantize"
    attribute {
      name: "levels"
      i: 256
      type: INT
    }
    domain: "org.openvinotoolkit"
  }
  node {
    input: "886"
    output: "887"
    name: "MaxPool_13"
    op_type: "MaxPool"
    attribute {
      name: "ceil_mode"
      i: 0
      type: INT
    }
    attribute {
      name: "kernel_shape"
      ints: 3
      ints: 3
      type: INTS
    }
    attribute {
      name: "pads"
      ints: 1
      ints: 1
      ints: 1
      ints: 1
      type: INTS
    }
    attribute {
      name: "strides"
      ints: 2
      ints: 2
      type: INTS
    }
  }
  name: "torch-jit-export"
  input {
    name: "883"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 64
          }
          dim {
            dim_value: 112
          }
          dim {
            dim_value: 112
          }
        }
      }
    }
  }
  output {
    name: "887"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 64
          }
          dim {
            dim_value: 56
          }
          dim {
            dim_value: 56
          }
        }
      }
    }
  }
 }
 opset_import {
  version: 10
 }
 opset_import {
  domain: "org.openvinotoolkit"
  version: 1
 }
--- a/inference-engine/tests/functional/plugin/shared/models/max_pool_qdq.prototxt
+++ b/inference-engine/tests/functional/plugin/shared/models/max_pool_qdq.prototxt
@ -0,0 +1,126 @@
 ir_version: 6
 producer_name: "pytorch"
 producer_version: "1.8"
 graph {
  node {
    output: "886"
    name: "Constant_12"
    op_type: "Constant"
    attribute {
      name: "value"
      t {
        data_type: 1
        raw_data: "\242k\312<"
      }
      type: TENSOR
    }
  }
  node {
    output: "887"
    name: "Constant_13"
    op_type: "Constant"
    attribute {
      name: "value"
      t {
        data_type: 2
        raw_data: "\000"
      }
      type: TENSOR
    }
  }
  node {
    input: "885"
    input: "886"
    input: "887"
    output: "888"
    name: "QuantizeLinear_14"
    op_type: "QuantizeLinear"
  }
  node {
    input: "888"
    input: "886"
    input: "887"
    output: "889"
    name: "DequantizeLinear_15"
    op_type: "DequantizeLinear"
  }
  node {
    input: "889"
    output: "890"
    name: "MaxPool_16"
    op_type: "MaxPool"
    attribute {
      name: "ceil_mode"
      i: 0
      type: INT
    }
    attribute {
      name: "kernel_shape"
      ints: 3
      ints: 3
      type: INTS
    }
    attribute {
      name: "pads"
      ints: 1
      ints: 1
      ints: 1
      ints: 1
      type: INTS
    }
    attribute {
      name: "strides"
      ints: 2
      ints: 2
      type: INTS
    }
  }
  name: "torch-jit-export"
  input {
    name: "885"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 64
          }
          dim {
            dim_value: 112
          }
          dim {
            dim_value: 112
          }
        }
      }
    }
  }
  output {
    name: "890"
    type {
      tensor_type {
        elem_type: 1
        shape {
          dim {
            dim_value: 1
          }
          dim {
            dim_value: 64
          }
          dim {
            dim_value: 56
          }
          dim {
            dim_value: 56
          }
        }
      }
    }
  }
 }
 opset_import {
  version: 10
 }
--- a/inference-engine/tests/functional/plugin/shared/src/onnx/quantized_models_tests.cpp
+++ b/inference-engine/tests/functional/plugin/shared/src/onnx/quantized_models_tests.cpp
@ -0,0 +1,49 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include <file_utils.h>
 #include "onnx/quantized_models_tests.hpp"
 namespace ONNXTestsDefinitions {
 // Builds the gtest case-name suffix for one parameterized instance.
 // obj.param holds the target device name; the result is "device=" followed by that name.
 std::string QuantizedModelsTests::getTestCaseName(const testing::TestParamInfo<std::string>& obj) {
    // Plain concatenation produces the same text as streaming into an ostringstream,
    // without copying the parameter first and without depending on <sstream>, which
    // this file does not include.
    return "device=" + obj.param;
 }
 void QuantizedModelsTests::SetUp() {
    targetDevice = this->GetParam();
 }
 static std::string getModelFullPath(const char* path) {
    return FileUtils::makePath<char>(TEST_MODELS, path);
 }
 // Reads the given model, executes it through the fixture's Run(), then inspects the
 // runtime graph: every node whose friendly name is a key of expected_layer_types must
 // report the mapped element type, and at least one such node must exist.
 void QuantizedModelsTests::runModel(const char* model, const std::unordered_map<std::string, ngraph::element::Type_t>& expected_layer_types) {
    const auto core = getCore();
    auto cnnNetwork = core->ReadNetwork(getModelFullPath(model));
    function = cnnNetwork.getFunction();
    Run();
    const auto execGraph = executableNetwork.GetExecGraphInfo().getFunction();
    int matchedOps = 0;
    for (const auto& op : execGraph->get_ordered_ops()) {
        const auto expected = expected_layer_types.find(op->get_friendly_name());
        if (expected == expected_layer_types.end()) {
            // Not one of the nodes the caller asked about.
            continue;
        }
        ++matchedOps;
        ASSERT_EQ(expected->second, op->get_element_type());
    }
    // Guard against a vacuous pass when none of the expected names were present.
    ASSERT_GT(matchedOps, 0);
 }
 // Runs max_pool_qdq.onnx and requires the runtime-graph node named "890_original"
 // to report the u8 element type.
 // NOTE(review): "890" is the MaxPool output in max_pool_qdq.prototxt; the "_original"
 // suffix is presumably added by the plugin's runtime graph - confirm.
 TEST_P(QuantizedModelsTests, MaxPoolQDQ) {
    runModel("max_pool_qdq.onnx", {{"890_original", ngraph::element::u8}});
 }
 // Runs max_pool_fq.onnx and requires the runtime-graph node named "887_original"
 // to report the u8 element type.
 // NOTE(review): "887" is the MaxPool output in max_pool_fq.prototxt; the "_original"
 // suffix is presumably added by the plugin's runtime graph - confirm.
 TEST_P(QuantizedModelsTests, MaxPoolFQ) {
    runModel("max_pool_fq.onnx", {{"887_original", ngraph::element::u8}});
 }
 } // namespace ONNXTestsDefinitions
--- a/ngraph/core/src/op/equal.cpp
+++ b/ngraph/core/src/op/equal.cpp
@ -34,6 +34,8 @@ bool evaluate_equal(const HostTensorPtr& arg0,
    out->set_broadcast(broadcast_spec, arg0, arg1, element::boolean);
    switch (arg0->get_element_type()) {
        NGRAPH_TYPE_CASE(evaluate_equal, boolean, arg0, arg1, out, broadcast_spec);
        NGRAPH_TYPE_CASE(evaluate_equal, i8, arg0, arg1, out, broadcast_spec);
        NGRAPH_TYPE_CASE(evaluate_equal, u8, arg0, arg1, out, broadcast_spec);
        NGRAPH_TYPE_CASE(evaluate_equal, i32, arg0, arg1, out, broadcast_spec);
        NGRAPH_TYPE_CASE(evaluate_equal, i64, arg0, arg1, out, broadcast_spec);
        NGRAPH_TYPE_CASE(evaluate_equal, u32, arg0, arg1, out, broadcast_spec);
@ -72,6 +74,8 @@ bool op::v1::Equal::has_evaluate() const {
    NGRAPH_OP_SCOPE(v1_Equal_has_evaluate);
    switch (get_input_element_type(0)) {
    case ngraph::element::boolean:
    case ngraph::element::i8:
    case ngraph::element::u8:
    case ngraph::element::i32:
    case ngraph::element::i64:
    case ngraph::element::u32:
--- a/ngraph/frontend/onnx/frontend/src/op/quantize_linear.cpp
+++ b/ngraph/frontend/onnx/frontend/src/op/quantize_linear.cpp
@ -88,9 +88,13 @@ std::tuple<std::shared_ptr<ngraph::Node>, std::shared_ptr<ngraph::Node>> get_inp
    input_low =
        std::make_shared<default_opset::Multiply>(y_scale,
                                                  std::make_shared<default_opset::Subtract>(output_low, zero_point));
    if (auto constant = get_constant_from_source(input_low))
        input_low = constant;
    input_high =
        std::make_shared<default_opset::Multiply>(y_scale,
                                                  std::make_shared<default_opset::Subtract>(output_high, zero_point));
    if (auto constant = get_constant_from_source(input_high))
        input_high = constant;
    return std::make_tuple(input_low, input_high);
 }
--- a/ngraph/test/constant_folding.cpp
+++ b/ngraph/test/constant_folding.cpp
@ -444,6 +444,26 @@ TEST(constant_folding, const_convert) {
        vector<int64_t> expected{1, 2, 3, 4, 5};
        test_const_convert(in, expected);
    }
    {
        vector<int8_t> in{-128, -2, 0, 1, 3, 127};
        vector<float> expected{-128, -2, 0, 1, 3, 127};
        test_const_convert(in, expected);
    }
    {
        vector<uint8_t> in{0, 1, 3, 127, 255};
        vector<float> expected{0, 1, 3, 127, 255};
        test_const_convert(in, expected);
    }
    {
        vector<float> in{-300, -128, -1, 0, 33, 127, 128};
        vector<int8_t> expected{-44, -128, -1, 0, 33, 127, -128};
        test_const_convert(in, expected);
    }
    {
        vector<float> in{0, 33, 127, 255, 256};
        vector<uint8_t> expected{0, 33, 127, 255, 0};
        test_const_convert(in, expected);
    }
 }
 TEST(constant_folding, shape_of_v0) {