[GPU] Change default infer precision to fp16 (#14752)

Author: Vladimir Paramuzov, 2023-01-04 15:55:44 +04:00 (committed by GitHub)
Parent: f31ebd4947
Commit: e23b8492c5
12 changed files with 82 additions and 32 deletions
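
For context: after this change, a model compiled for GPU without any hint runs with f16 inference precision, so applications that need fp32 execution must request it explicitly. Below is a minimal sketch of the pattern used throughout the updated tests, assuming the OpenVINO 2.0 C++ API and a placeholder model file:

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical IR file

    // Default after this commit: f16 inference precision on GPU.
    auto compiled_f16 = core.compile_model(model, "GPU");

    // Explicit hint restores the previous fp32 behavior.
    auto compiled_f32 = core.compile_model(model, "GPU",
                                           ov::hint::inference_precision(ov::element::f32));
    return 0;
}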

View File

@@ -32,7 +32,7 @@ struct Config {
max_dynamic_batch(1),
customLayers({}),
kernels_cache_dir(""),
- inference_precision(ov::element::undefined),
+ inference_precision(ov::element::f16),
task_exec_config({"GPU plugin internal task executor", // name
std::max(1, static_cast<int>(std::thread::hardware_concurrency())), // # of streams
1, // # of threads per streams

View File

@@ -55,7 +55,7 @@ TEST_P(OVConcurrencyTest, canInferTwoExecNets) {
auto fn = fn_ptrs[i];
auto exec_net = ie.compile_model(fn_ptrs[i], CommonTestUtils::DEVICE_GPU,
- {{ov::ie::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+ {ov::num_streams(num_streams), ov::hint::inference_precision(ov::element::f32)});
auto input = fn_ptrs[i]->get_parameters().at(0);
auto output = fn_ptrs[i]->get_results().at(0);
@@ -115,7 +115,7 @@ TEST(canSwapTensorsBetweenInferRequests, inputs) {
auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
const int infer_requests_num = 2;
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
@@ -193,7 +193,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, usmHostIsNotChanged) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -232,7 +232,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, canSetSystemHostTensor) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -258,7 +258,7 @@ TEST(canSwapTensorsBetweenInferRequests, outputs) {
auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
const int infer_requests_num = 2;
ov::InferRequest infer_request1 = compiled_model.create_infer_request();

View File

@@ -40,6 +40,7 @@ public:
{CONFIG_KEY(AUTO_BATCH_TIMEOUT) , "0"},
};
}
+ config.insert({ov::hint::inference_precision.name(), "f32"});
fn_ptr = ov::test::behavior::getDefaultNGraphFunctionForTheDevice(with_auto_batching ? CommonTestUtils::DEVICE_BATCH : deviceName);
}
static std::string getTestCaseName(const testing::TestParamInfo<bool>& obj) {
@@ -229,7 +230,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserContext) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -276,7 +277,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -304,7 +305,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
// In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
// without calling thread blocks
auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
- auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+ auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
@@ -374,7 +375,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -403,7 +404,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
// In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
// without calling thread blocks
auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
- auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+ auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
@@ -468,7 +469,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -497,7 +498,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
// In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
// without calling thread blocks
auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
- auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+ auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
@@ -600,7 +601,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
/* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */
auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU,
- { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES} });
+ { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES}, {ov::hint::inference_precision.name(), "f32"} });
auto inf_req_remote = exec_net_b.CreateInferRequest();
auto cldnn_context = exec_net_b.GetContext();
cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
@@ -669,7 +670,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
net_local.getInputsInfo().begin()->second->setPrecision(Precision::U8);
net_local.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);
- auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU);
+ auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_local = exec_net_b1.CreateInferRequest();
@@ -740,7 +741,8 @@ TEST_P(TwoNets_Test, canInferTwoExecNets) {
net.getInputsInfo().begin()->second->setPrecision(Precision::FP32);
auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU,
- {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+ {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)},
+ {ov::hint::inference_precision.name(), "f32"}});
for (int j = 0; j < num_streams * num_requests; j++) {
outputs.push_back(net.getOutputsInfo().begin()->first);

View File

@@ -344,12 +344,12 @@ TEST_P(OVClassGetPropertyTest_GPU, GetAndSetEnableProfilingNoThrow) {
TEST_P(OVClassGetPropertyTest_GPU, GetAndSetInferencePrecisionNoThrow) {
ov::Core ie;
auto value = ov::element::undefined;
- const auto expected_default_precision = ov::element::undefined;
+ const auto expected_default_precision = ov::element::f16;
OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
ASSERT_EQ(expected_default_precision, value);
- const auto forced_precision = ov::element::f16;
+ const auto forced_precision = ov::element::f32;
OV_ASSERT_NO_THROW(ie.set_property(target_device, ov::hint::inference_precision(forced_precision)));
OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
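
The same default is visible through the property API, which is what the test above now checks. A short sketch, assuming a GPU device is present, of querying the plugin-level default and forcing it back to f32:

#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // With this commit the reported default is expected to be f16 (previously undefined).
    auto default_precision = core.get_property("GPU", ov::hint::inference_precision);
    std::cout << "default inference precision: " << default_precision << std::endl;

    // The hint can still be overridden at the device level.
    core.set_property("GPU", ov::hint::inference_precision(ov::element::f32));
    auto forced_precision = core.get_property("GPU", ov::hint::inference_precision);
    std::cout << "forced inference precision: " << forced_precision << std::endl;
    return 0;
}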

View File

@@ -5,4 +5,16 @@
#include "functional_test_utils/core_config.hpp"
void CoreConfiguration(LayerTestsUtils::LayerTestsCommon* test) {
+ std::shared_ptr<InferenceEngine::Core> core = PluginCache::get().ie();
+ ov::element::Type hint = ov::element::f32;
+ for (auto& param : test->GetFunction()->get_parameters()) {
+ if (param->get_output_element_type(0) == ov::element::f16) {
+ hint = ov::element::f16;
+ break;
+ }
+ }
+ // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+ std::map<std::string, std::string> config = {{"INFERENCE_PRECISION_HINT", hint.get_type_name()}};
+ core->SetConfig(config, CommonTestUtils::DEVICE_GPU);
}
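
The helper above drives the hint through the legacy string-based SetConfig path. A rough equivalent with the 2.0 properties API (a sketch; the function name is illustrative, not part of this commit):

#include <memory>
#include <openvino/openvino.hpp>

// Pin the GPU plugin to f32 unless the model itself carries f16 inputs,
// mirroring the per-test logic added in CoreConfiguration() above.
void configure_gpu_precision(ov::Core& core, const std::shared_ptr<ov::Model>& model) {
    ov::element::Type hint = ov::element::f32;
    for (const auto& param : model->get_parameters()) {
        if (param->get_output_element_type(0) == ov::element::f16) {
            hint = ov::element::f16;
            break;
        }
    }
    core.set_property("GPU", ov::hint::inference_precision(hint));
}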

View File

@@ -125,5 +125,8 @@ std::vector<std::string> disabledTestPatterns() {
R"(.*smoke_GroupConvolution1D_ExplicitPadding_Disabled.*)",
R"(.*smoke_GroupConvolutionLayerGPUTest_dynamic1DSymPad_Disabled.*)",
R"(.*smoke_ConvolutionLayerGPUTest_dynamic1DSymPad.*)",
+ // Looks like the test is targeting CPU plugin and doesn't respect that execution graph may vary from plugin to plugin
+ R"(.*ExecGraphSerializationTest.*)",
};
}

View File

@@ -70,8 +70,11 @@ protected:
n.second->setPrecision(Precision::FP32);
}
std::map<std::string, std::string> config;
- if (target_device.find("GPU") != std::string::npos)
+ if (target_device.find("GPU") != std::string::npos) {
config[CONFIG_KEY(GPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
+ config["INFERENCE_PRECISION_HINT"] = "f32";
+ }
if (target_device.find("CPU") != std::string::npos) {
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
config[CONFIG_KEY(ENFORCE_BF16)] = CONFIG_VALUE(NO);

View File

@@ -11,6 +11,7 @@
#include "functional_test_utils/skip_tests_config.hpp"
#include "common_test_utils/ngraph_test_utils.hpp"
#include "common_test_utils/test_constants.hpp"
#include "execution_graph_tests/normalize_l2_decomposition.hpp"
namespace ExecutionGraphTests {
@@ -33,7 +34,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNonContiguousAxes
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
}
@@ -50,7 +54,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNormalizeOverAllA
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
}
@@ -67,7 +74,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForNotSorted) {
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
}
@@ -84,7 +94,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForSingleAxis) {
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
}

View File

@@ -216,6 +216,18 @@ void SubgraphBaseTest::compile_model() {
}
#endif
+ // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+ if (targetDevice == CommonTestUtils::DEVICE_GPU) {
+ ov::element::Type hint = ov::element::f32;
+ for (auto& param : function->get_parameters()) {
+ if (param->get_output_element_type(0) == ov::element::f16) {
+ hint = ov::element::f16;
+ break;
+ }
+ }
+ configuration.insert({ov::hint::inference_precision.name(), hint});
+ }
compiledModel = core->compile_model(function, targetDevice, configuration);
if (is_report_stages) {
auto end_time = std::chrono::system_clock::now();

View File

@@ -76,6 +76,11 @@ class CommonLayerTest:
# (flag, resp) = ir.compare(ref_net)
# assert flag, '\n'.join(resp)
+ config = None
+ # GPU default execution precision is FP16, so if we want to check FP32 inference we need to set explicit precision hint
+ if ie_device == 'GPU' and precision == 'FP32':
+ config = {'INFERENCE_PRECISION_HINT' : 'f32'}
if self.use_old_api:
ie_engine = IEInfer(model=path_to_xml,
weights=path_to_bin,
@@ -93,7 +98,7 @@ class CommonLayerTest:
inputs_dict = self._prepare_input(ie_engine.get_inputs_info(precision))
# IE infer:
- infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout)
+ infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout, config=config)
if hasattr(self, 'skip_framework') and self.skip_framework:
warnings.warn('Framework is skipped')

View File

@@ -23,14 +23,14 @@ class BaseInfer:
self.name = name
self.res = None
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
raise RuntimeError("This is base class, please implement infer function for the specific framework")
def get_inputs_info(self, precision) -> dict:
raise RuntimeError("This is base class, please implement get_inputs_info function for the specific framework")
- def infer(self, input_data, infer_timeout=10):
- self.res = multiprocessing_run(self.fw_infer, [input_data], self.name, infer_timeout)
+ def infer(self, input_data, config=None, infer_timeout=10):
+ self.res = multiprocessing_run(self.fw_infer, [input_data, config], self.name, infer_timeout)
return self.res
@@ -41,7 +41,7 @@ class IEInfer(BaseInfer):
self.model = model
self.weights = weights
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
print("Inference Engine version: {}".format(ie_get_version()))
print("Creating IE Core Engine...")
@@ -49,7 +49,7 @@ class IEInfer(BaseInfer):
print("Reading network files")
net = ie.read_network(self.model, self.weights)
print("Loading network")
- exec_net = ie.load_network(net, self.device)
+ exec_net = ie.load_network(net, self.device, config)
print("Starting inference")
result = exec_net.infer(input_data)
@@ -78,14 +78,14 @@ class InferAPI20(BaseInfer):
self.weights = weights
self.use_new_frontend = use_new_frontend
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
print("Inference Engine version: {}".format(ie2_get_version()))
print("Creating IE Core Engine...")
ie = Core()
print("Reading network files")
net = ie.read_model(self.model, self.weights)
print("Loading network")
- exec_net = ie.compile_model(net, self.device)
+ exec_net = ie.compile_model(net, self.device, config)
print("Starting inference")
request = exec_net.create_infer_request()
request_result = request.infer(input_data)

View File

@@ -19,7 +19,7 @@ class OnnxRuntimeInfer(BaseInfer):
super().__init__('OnnxRuntime')
self.net = net
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
import onnxruntime as rt
sess = rt.InferenceSession(self.net)