[LPT] [GPU] Multiply to group convolution (#9971)

* [LPT] MultiplyToGroupConvolution optimization for GPU

* [LPT] Support for the GPU workaround that keeps MatMul in FP32

* [LPT] GPU plugin tests
Edward Shogulin 2022-02-01 08:10:27 +03:00 committed by GitHub
parent 8c7e0d9479
commit cc19ff74f1
12 changed files with 102 additions and 35 deletions


@@ -239,9 +239,11 @@ public:
 public:
     Params(
         const bool updatePrecisions = true,
-        element::Type deqPrecision = element::f32) :
+        element::Type deqPrecision = element::f32,
+        const bool reshapeIgnorePerTensorQuantizationCheck = false) :
         updatePrecisions(updatePrecisions),
-        deqPrecision(deqPrecision) {}
+        deqPrecision(deqPrecision),
+        reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck) {}

     Params& setUpdatePrecisions(const bool updatePrecisions) {
         this->updatePrecisions = updatePrecisions;
@@ -255,6 +257,8 @@ public:
     bool updatePrecisions;
     element::Type deqPrecision;
+    // to support the GPU workaround that keeps Reshape and MatMul in FP32
+    bool reshapeIgnorePerTensorQuantizationCheck;
 };

 class PrecisionDetails {
@@ -322,6 +326,7 @@ protected:
     bool updatePrecisions;
     element::Type deqPrecision;
+    bool reshapeIgnorePerTensorQuantizationCheck;
     static constexpr char originalLayerPostfix[] = "_original";

     TransformationContext* context;
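For context, a minimal sketch of how a caller opts into the new flag; the third constructor argument mirrors the GPU pipeline change further below, while the include path and namespace follow the OpenVINO LPT layout (everything else here is illustrative):

```cpp
#include "low_precision/layer_transformation.hpp"

using namespace ngraph::pass::low_precision;

// Assumption: built inside a plugin's transformation pipeline and then
// handed to the LowPrecision pass, as the GPU pipeline does below.
const auto lptParams = LayerTransformation::Params(
    true,                  // updatePrecisions
    ngraph::element::f32,  // deqPrecision
    true);                 // reshapeIgnorePerTensorQuantizationCheck (GPU-only workaround)
```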


@@ -87,7 +87,7 @@ public:
     static std::shared_ptr<opset1::Constant> toScalar(std::shared_ptr<opset1::Constant> constant);

-    static std::shared_ptr<Node> getConstantInput(const std::shared_ptr<Node>& node, const bool convertIsExpected = false);
+    static std::shared_ptr<Node> getConstantInput(const std::shared_ptr<const Node>& node, const bool convertIsExpected = false);

     static std::vector<size_t> updateReshapeValues(
         const Shape& elementwiseConstantShape,


@@ -30,6 +30,7 @@ std::mutex LayerTransformation::defaultPrecisionsMutex;
 LayerTransformation::LayerTransformation(const Params& params) :
     updatePrecisions(params.updatePrecisions),
     deqPrecision(params.deqPrecision),
+    reshapeIgnorePerTensorQuantizationCheck(params.reshapeIgnorePerTensorQuantizationCheck),
     context(nullptr) {}

 void LayerTransformation::setContext(TransformationContext* context) noexcept {


@@ -373,7 +373,7 @@ std::shared_ptr<opset1::Constant> NetworkHelper::toScalar(std::shared_ptr<opset1
     return std::make_shared<opset1::Constant>(constant->get_element_type(), Shape{}, constant->get_data_ptr());
 }

-std::shared_ptr<Node> NetworkHelper::getConstantInput(const std::shared_ptr<Node>& node, const bool convertIsExpected) {
+std::shared_ptr<Node> NetworkHelper::getConstantInput(const std::shared_ptr<const Node>& node, const bool convertIsExpected) {
     std::shared_ptr<Node> parent = ov::as_type_ptr<opset1::Constant>(node->input_value(0).get_node_shared_ptr());
     if (parent != nullptr) {
         return parent;


@@ -195,8 +195,18 @@ bool ReshapeTransformation::canBeTransformed(const TransformationContext& contex
         return false;
     }

-    if (((dequantization.subtract == nullptr) || NetworkHelper::isScalarLike(dequantization.subtractConstant)) &&
-        ((dequantization.multiply == nullptr) || NetworkHelper::isScalarLike(dequantization.multiplyConstant))) {
+    bool ignorePerTensorQuantizationCheck = false;
+    if (reshapeIgnorePerTensorQuantizationCheck) {
+        const auto inputs = op->get_output_target_inputs(0);
+        if (inputs.size() == 1ul) {
+            const auto consumer = inputs.begin()->get_node();
+            ignorePerTensorQuantizationCheck = ngraph::as_type<ngraph::opset1::MatMul>(consumer) != nullptr;
+        }
+    }
+
+    if (!ignorePerTensorQuantizationCheck &&
+        (((dequantization.subtract == nullptr) || NetworkHelper::isScalarLike(dequantization.subtractConstant)) &&
+        ((dequantization.multiply == nullptr) || NetworkHelper::isScalarLike(dequantization.multiplyConstant)))) {
         return true;
     }
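The new branch only fires when the Reshape's single consumer is a MatMul; in that case the per-tensor quantization check is bypassed and per-channel dequantization is allowed to stay on the Reshape. A minimal sketch of that pattern, with made-up shapes, using ngraph opset1:

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

// Illustrative subgraph only: a Reshape whose sole consumer is a MatMul.
// With reshapeIgnorePerTensorQuantizationCheck enabled, canBeTransformed()
// accepts this even when the dequantization scales are per-channel.
std::shared_ptr<ngraph::Function> makeReshapeMatMulPattern() {
    using namespace ngraph;
    const auto input = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 16, 64});
    const auto targetShape = opset1::Constant::create(element::i64, Shape{2}, {16, 64});
    const auto reshape = std::make_shared<opset1::Reshape>(input, targetShape, false);
    const auto weights = opset1::Constant::create(element::f32, Shape{64, 32}, {0.5f});
    const auto matMul = std::make_shared<opset1::MatMul>(reshape, weights);  // single consumer
    return std::make_shared<Function>(NodeVector{matMul}, ParameterVector{input});
}
```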


@@ -409,13 +409,47 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         return LayerTransformation::isAsymmetricQuantization(node) || WeightableLayerTransformation::isAsymmetricOnWeights(node);
     });

+    if (!use_onednn) {
+        lptPassConfig->set_callback<MatMulTransformation>([](const_node_ptr& node) -> bool {
+            return MatMulTransformation::is3DTensorOnActivations(node);
+        });
+    }

-    lptManager.register_pass<LowPrecision>(supportedPrecisions, perTensorQuantization);
+    lptPassConfig->set_callback<MultiplyToGroupConvolutionTransformation>([&](const_node_ptr& node) -> bool {
+        // disable MultiplyToGroupConvolution if Multiply with Constant can be fused
+        const auto dequantization = NetworkHelper::getDequantization(node, 0, true);
+        std::shared_ptr<ov::Node> parent = dequantization.empty() ? nullptr : dequantization.data.get_node()->shared_from_this();
+        if (parent == nullptr) {
+            const auto constantNode = NetworkHelper::getConstantInput(node);
+            const auto constant = constantNode == nullptr ? nullptr : ngraph::as_type_ptr<ngraph::opset1::Constant>(constantNode);
+            if (constant != nullptr) {
+                parent = node->get_input_node_shared_ptr(0);
+                if (parent == constant) {
+                    parent = node->get_input_node_shared_ptr(1);
+                }
+            }
+        }
+
+        if (parent != nullptr) {
+            const auto parentHasOneConsumer = parent->get_output_target_inputs(0).size() == 1ul;
+            if (parentHasOneConsumer) {
+                return true;
+            }
+        }
+
+        // disable MultiplyToGroupConvolution for Multiply with scalar
+        if (MultiplyToGroupConvolutionTransformation::isDynamicOrScalar(node)) {
+            return true;
+        }
+
+        return false;
+    });
+
+    auto params = LayerTransformation::Params(true, element::f32, true);
+    lptManager.register_pass<LowPrecision>(supportedPrecisions, perTensorQuantization, params);
     lptManager.run_passes(func);
 }
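To make the callback's first case concrete, a hedged sketch (names and shapes are illustrative, modeled on the maxPool pattern in the test builder below): when the Multiply's data-side parent has exactly one consumer, the Multiply-by-constant can be fused into that parent, so the callback returns true and the transformation is skipped; with a second consumer, conversion to GroupConvolution proceeds.

```cpp
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

void multiplyFusionCases() {
    using namespace ngraph;
    const auto input = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 3, 16, 16});
    const auto scale = opset1::Constant::create(element::f32, Shape{1, 3, 1, 1}, {2.f, 3.f, 4.f});

    // Case 1: MaxPool feeds only the Multiply; one consumer, so the
    // Multiply can be fused and the callback disables the transformation.
    const auto pool = std::make_shared<opset1::MaxPool>(
        input, Strides{1, 1}, Shape{0, 0}, Shape{0, 0}, Shape{1, 1});
    const auto fusable = std::make_shared<opset1::Multiply>(pool, scale);

    // Case 2: the pool output is also a model Result; two consumers, so
    // fusion is impossible and MultiplyToGroupConvolution is applied.
    const auto extraConsumer = std::make_shared<opset1::Result>(pool);
    const auto transformed = std::make_shared<opset1::Multiply>(pool, scale);
}
```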


@@ -23,35 +23,40 @@ const std::vector<MultiplyToGroupConvolutionTransformationParam> params = {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
         {{1.f, 2.f, 3.f}, element::f32, Shape{1, 3, 1, 1}},
         "output/GroupConvolution",
-        "U8"
+        "U8",
+        true
     },
     // Multiply with scalar is not transformed to GroupConvolution
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
         {{4.f}, element::f32, Shape{1, 1, 1, 1}},
         "output/GroupConvolution",
-        ""
+        "",
+        true
     },
     // Multiply with scalar is not transformed to GroupConvolution
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
         {{4.f}, element::f32, Shape{}},
         "output/GroupConvolution",
-        ""
+        "",
+        true
     },
     // Zero point
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { -1.28f }, { 1.27f }, { -1.28f }, { 1.27f } },
         {{1.f, 2.f, 3.f}, element::f32, Shape{1, 3, 1, 1}},
         "output/GroupConvolution",
-        "U8"
+        "U8",
+        true
     },
     // Zero point
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { -1.28f }, { 1.27f / 2.f }, { -1.28f }, { 1.27f / 2.f} },
         {{1.f, 2.f, 3.f}, element::f32, Shape{1, 3, 1, 1}},
         "output/GroupConvolution",
-        "U8"
+        "U8",
+        true
     }
 };


@@ -23,36 +23,41 @@ const std::vector<MultiplyToGroupConvolutionTransformationParam> params = {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
         {{1.f, 2.f, 3.f}, element::f32, Shape{1, 3, 1, 1}},
         "output/GroupConvolution",
         "U8"
     },
     // Multiply with scalar is transformed to GroupConvolution
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
         {{4.f}, element::f32, Shape{1, 1, 1, 1}},
         "output/GroupConvolution",
         "U8"
     },
     // multiply with scalar is transformed to groupconvolution
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
         {{4.f}, element::f32, Shape{}},
         "output/GroupConvolution",
-        "U8"
+        "U8",
+        false
     },
     // zero point
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { -1.28f }, { 1.27f }, { -1.28f }, { 1.27f } },
         {{1.f, 2.f, 3.f}, element::f32, Shape{1, 3, 1, 1}},
         "output/GroupConvolution",
-        "I8"
+        "I8",
+        false
     },
     // zero point
     {
         { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { -1.28f }, { 1.27f / 2.f }, { -1.28f }, { 1.27f / 2.f} },
         {{1.f, 2.f, 3.f}, element::f32, Shape{1, 3, 1, 1}},
         "output/GroupConvolution",
-        "U8"
-    }
+        "U8",
+        false
+    },
+    // Multiply => GroupConvolution optimizations
+    {
+        { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
+        {{3.f}, element::f32, Shape{1, 1, 1, 1}},
+        "output/GroupConvolution",
+        "",
+        false
+    },
+    {
+        { 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 25.5f } },
+        {{3.f}, element::f32, Shape{1, 1, 1, 1}},
+        "output/GroupConvolution",
+        "",
+        true
+    },
 };
INSTANTIATE_TEST_SUITE_P(smoke_LPT, MultiplyToGroupConvolutionTransformation,


@@ -22,6 +22,7 @@ public:
     builder::subgraph::Constant constant;
     std::string layerName;
     std::string expectedKernelType;
+    bool parentHasOneConsumer;
 };

 typedef std::tuple <


@@ -34,7 +34,8 @@ std::string MultiplyToGroupConvolutionTransformation::getTestCaseName(const test
         param.fqOnData << "_" <<
         param.constant << "_" <<
         param.layerName << "_" <<
-        param.expectedKernelType;
+        param.expectedKernelType << "_" <<
+        param.parentHasOneConsumer;
     return result.str();
 }

@@ -48,7 +49,8 @@ void MultiplyToGroupConvolutionTransformation::SetUp() {
         precision,
         shape,
         param.fqOnData,
-        param.constant);
+        param.constant,
+        param.parentHasOneConsumer);
 }

 void MultiplyToGroupConvolutionTransformation::Run() {


@@ -29,7 +29,8 @@ public:
         const ngraph::element::Type precision,
         const ngraph::PartialShape& inputShape,
         const FakeQuantizeOnData& fqOnData,
-        const Constant& constant);
+        const Constant& constant,
+        const bool parentHasOneConsumer = true);

     static std::shared_ptr<ngraph::Function> getReference(
         const ngraph::PartialShape& inputShape,


@@ -39,7 +39,8 @@ std::shared_ptr<ngraph::Function> MultiplyToGroupConvolutionFunction::getOrigina
     const ngraph::element::Type precision,
     const ngraph::PartialShape& inputShape,
     const FakeQuantizeOnData& fqOnData,
-    const Constant& constant) {
+    const Constant& constant,
+    const bool parentHasOneConsumer) {
     const auto input = std::make_shared<ngraph::opset1::Parameter>(precision, inputShape);
     const auto fakeQuantize = makeFakeQuantize(input, precision, fqOnData);

@@ -58,7 +59,9 @@ std::shared_ptr<ngraph::Function> MultiplyToGroupConvolutionFunction::getOrigina
         std::make_shared<ngraph::opset1::Constant>(constant.outPrecision, constant.shape, constant.values));
     multiply->set_friendly_name("output");

-    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(multiply)};
+    ngraph::ResultVector results = parentHasOneConsumer ?
+        ngraph::ResultVector{std::make_shared<ngraph::opset1::Result>(multiply)} :
+        ngraph::ResultVector{std::make_shared<ngraph::opset1::Result>(maxPool), std::make_shared<ngraph::opset1::Result>(multiply)};

     return std::make_shared<ngraph::Function>(results, ngraph::ParameterVector{ input }, "MultiplyToGroupConvolutionFunction");
 }