diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/device_config.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/device_config.hpp
index 227390ee2ef..364f1159238 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/device_config.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/device_config.hpp
@@ -32,7 +32,7 @@ struct Config {
         max_dynamic_batch(1),
         customLayers({}),
         kernels_cache_dir(""),
-        inference_precision(ov::element::undefined),
+        inference_precision(ov::element::f16),
         task_exec_config({"GPU plugin internal task executor",                                 // name
                           std::max(1, static_cast<int>(std::thread::hardware_concurrency())), // # of streams
                           1,                                                                   // # of threads per streams
diff --git a/src/tests/functional/plugin/gpu/concurrency/gpu_concurrency_tests.cpp b/src/tests/functional/plugin/gpu/concurrency/gpu_concurrency_tests.cpp
index b8156314737..ed91286008e 100644
--- a/src/tests/functional/plugin/gpu/concurrency/gpu_concurrency_tests.cpp
+++ b/src/tests/functional/plugin/gpu/concurrency/gpu_concurrency_tests.cpp
@@ -55,7 +55,7 @@ TEST_P(OVConcurrencyTest, canInferTwoExecNets) {
         auto fn = fn_ptrs[i];
 
         auto exec_net = ie.compile_model(fn_ptrs[i], CommonTestUtils::DEVICE_GPU,
-                                         {{ov::ie::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+                                         {ov::num_streams(num_streams), ov::hint::inference_precision(ov::element::f32)});
 
         auto input = fn_ptrs[i]->get_parameters().at(0);
         auto output = fn_ptrs[i]->get_results().at(0);
@@ -115,7 +115,7 @@ TEST(canSwapTensorsBetweenInferRequests, inputs) {
     auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();
 
     auto ie = ov::Core();
-    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
 
     const int infer_requests_num = 2;
     ov::InferRequest infer_request1 = compiled_model.create_infer_request();
@@ -193,7 +193,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, usmHostIsNotChanged) {
    auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
 
    auto ie = ov::Core();
-   auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+   auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
 
    ov::InferRequest infer_request1 = compiled_model.create_infer_request();
    ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -232,7 +232,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, canSetSystemHostTensor) {
    auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
 
    auto ie = ov::Core();
-   auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+   auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
 
    ov::InferRequest infer_request1 = compiled_model.create_infer_request();
    ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -258,7 +258,7 @@ TEST(canSwapTensorsBetweenInferRequests, outputs) {
     auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();
 
     auto ie = ov::Core();
-    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
 
     const int infer_requests_num = 2;
     ov::InferRequest infer_request1 = compiled_model.create_infer_request();
diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
index 9cb8db61e15..041bb1aea2c 100644
--- a/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
+++ b/src/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
@@ -40,6 +40,7 @@ public:
                       {CONFIG_KEY(AUTO_BATCH_TIMEOUT) , "0"},
             };
         }
+        config.insert({ov::hint::inference_precision.name(), "f32"});
         fn_ptr = ov::test::behavior::getDefaultNGraphFunctionForTheDevice(with_auto_batching ? CommonTestUtils::DEVICE_BATCH : deviceName);
     }
     static std::string getTestCaseName(const testing::TestParamInfo& obj) {
@@ -229,7 +230,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserContext) {
     auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
 
     auto ie = PluginCache::get().ie();
-    auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+    auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
 
     // regular inference
     auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -276,7 +277,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
     auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
 
     auto ie = PluginCache::get().ie();
-    auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+    auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
 
     // regular inference
     auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -304,7 +305,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
     // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
     // without calling thread blocks
     auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
-    auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+    auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
     auto inf_req_shared = exec_net_shared.CreateInferRequest();
 
     // Allocate shared buffers for input and output data which will be set to infer request
@@ -374,7 +375,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
     auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
 
     auto ie = PluginCache::get().ie();
-    auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+    auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
 
     // regular inference
     auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -403,7 +404,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
     // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
     // without calling thread blocks
     auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
-    auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+    auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
     auto inf_req_shared = exec_net_shared.CreateInferRequest();
 
     // Allocate shared buffers for input and output data which will be set to infer request
@@ -468,7 +469,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
     auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
 
     auto ie = PluginCache::get().ie();
-    auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+    auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
 
     // regular inference
     auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -497,7 +498,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
     // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
     // without calling thread blocks
     auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
-    auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+    auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
     auto inf_req_shared = exec_net_shared.CreateInferRequest();
 
     // Allocate shared buffers for input and output data which will be set to infer request
@@ -600,7 +601,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
 
     /* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */
     auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU,
-                { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES} });
+                { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES}, {ov::hint::inference_precision.name(), "f32"} });
     auto inf_req_remote = exec_net_b.CreateInferRequest();
     auto cldnn_context = exec_net_b.GetContext();
     cl_context ctx = std::dynamic_pointer_cast(cldnn_context)->get();
@@ -669,7 +670,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
     net_local.getInputsInfo().begin()->second->setPrecision(Precision::U8);
     net_local.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);
 
-    auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU);
+    auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU, {{ov::hint::inference_precision.name(), "f32"}});
 
     auto inf_req_local = exec_net_b1.CreateInferRequest();
 
@@ -740,7 +741,8 @@ TEST_P(TwoNets_Test, canInferTwoExecNets) {
         net.getInputsInfo().begin()->second->setPrecision(Precision::FP32);
 
         auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU,
-                                       {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+                                       {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)},
+                                        {ov::hint::inference_precision.name(), "f32"}});
 
         for (int j = 0; j < num_streams * num_requests; j++) {
             outputs.push_back(net.getOutputsInfo().begin()->first);
diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/core_integration.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/core_integration.cpp
index d26778d07d5..fe5ad0951b5 100644
--- a/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/core_integration.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/behavior/ov_plugin/core_integration.cpp
@@ -344,12 +344,12 @@ TEST_P(OVClassGetPropertyTest_GPU, GetAndSetEnableProfilingNoThrow) {
 
 TEST_P(OVClassGetPropertyTest_GPU, GetAndSetInferencePrecisionNoThrow) {
     ov::Core ie;
     auto value = ov::element::undefined;
-    const auto expected_default_precision = ov::element::undefined;
+    const auto expected_default_precision = ov::element::f16;
 
     OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
     ASSERT_EQ(expected_default_precision, value);
 
-    const auto forced_precision = ov::element::f16;
+    const auto forced_precision = ov::element::f32;
     OV_ASSERT_NO_THROW(ie.set_property(target_device, ov::hint::inference_precision(forced_precision)));
     OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/core_config.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/core_config.cpp
index 5e7e7675443..138101533f4 100644
--- a/src/tests/functional/plugin/gpu/shared_tests_instances/core_config.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/core_config.cpp
@@ -5,4 +5,16 @@
 #include "functional_test_utils/core_config.hpp"
 
 void CoreConfiguration(LayerTestsUtils::LayerTestsCommon* test) {
+    std::shared_ptr<InferenceEngine::Core> core = PluginCache::get().ie();
+    ov::element::Type hint = ov::element::f32;
+    for (auto& param : test->GetFunction()->get_parameters()) {
+        if (param->get_output_element_type(0) == ov::element::f16) {
+            hint = ov::element::f16;
+            break;
+        }
+    }
+
+    // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+    std::map<std::string, std::string> config = {{"INFERENCE_PRECISION_HINT", hint.get_type_name()}};
+    core->SetConfig(config, CommonTestUtils::DEVICE_GPU);
 }
diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp
index 02a4f39b992..0193d47b053 100644
--- a/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp
@@ -125,5 +125,8 @@ std::vector<std::string> disabledTestPatterns() {
         R"(.*smoke_GroupConvolution1D_ExplicitPadding_Disabled.*)",
         R"(.*smoke_GroupConvolutionLayerGPUTest_dynamic1DSymPad_Disabled.*)",
         R"(.*smoke_ConvolutionLayerGPUTest_dynamic1DSymPad.*)",
+
+        // Looks like the test is targeting CPU plugin and doesn't respect that execution graph may vary from plugin to plugin
+        R"(.*ExecGraphSerializationTest.*)",
     };
 }
diff --git a/src/tests/functional/plugin/shared/include/behavior/plugin/auto_batching_tests.hpp b/src/tests/functional/plugin/shared/include/behavior/plugin/auto_batching_tests.hpp
index 270eed3dad0..733d1f9246a 100644
--- a/src/tests/functional/plugin/shared/include/behavior/plugin/auto_batching_tests.hpp
+++ b/src/tests/functional/plugin/shared/include/behavior/plugin/auto_batching_tests.hpp
@@ -70,8 +70,11 @@ protected:
             n.second->setPrecision(Precision::FP32);
         }
         std::map<std::string, std::string> config;
-        if (target_device.find("GPU") != std::string::npos)
+        if (target_device.find("GPU") != std::string::npos) {
             config[CONFIG_KEY(GPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
+            config["INFERENCE_PRECISION_HINT"] = "f32";
+        }
+
         if (target_device.find("CPU") != std::string::npos) {
             config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
             config[CONFIG_KEY(ENFORCE_BF16)] = CONFIG_VALUE(NO);
diff --git a/src/tests/functional/plugin/shared/src/execution_graph_tests/normalize_l2_decomposition.cpp b/src/tests/functional/plugin/shared/src/execution_graph_tests/normalize_l2_decomposition.cpp
index 006617645a8..fc56c880a13 100644
--- a/src/tests/functional/plugin/shared/src/execution_graph_tests/normalize_l2_decomposition.cpp
+++ b/src/tests/functional/plugin/shared/src/execution_graph_tests/normalize_l2_decomposition.cpp
@@ -11,6 +11,7 @@
 
 #include "functional_test_utils/skip_tests_config.hpp"
 #include "common_test_utils/ngraph_test_utils.hpp"
+#include "common_test_utils/test_constants.hpp"
 #include "execution_graph_tests/normalize_l2_decomposition.hpp"
 
 namespace ExecutionGraphTests {
@@ -33,7 +34,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNonContiguousAxes
     const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
 
     auto core = ov::Core();
-    const auto compiled_model = core.compile_model(model, device_name);
+    ov::AnyMap config;
+    if (device_name == CommonTestUtils::DEVICE_GPU)
+        config.insert(ov::hint::inference_precision(ov::element::f32));
+    const auto compiled_model = core.compile_model(model, device_name, config);
 
     ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
 }
@@ -50,7 +54,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNormalizeOverAllA
     const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
 
     auto core = ov::Core();
-    const auto compiled_model = core.compile_model(model, device_name);
+    ov::AnyMap config;
+    if (device_name == CommonTestUtils::DEVICE_GPU)
+        config.insert(ov::hint::inference_precision(ov::element::f32));
+    const auto compiled_model = core.compile_model(model, device_name, config);
 
     ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
 }
@@ -67,7 +74,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForNotSorted) {
     const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
 
     auto core = ov::Core();
-    const auto compiled_model = core.compile_model(model, device_name);
+    ov::AnyMap config;
+    if (device_name == CommonTestUtils::DEVICE_GPU)
+        config.insert(ov::hint::inference_precision(ov::element::f32));
+    const auto compiled_model = core.compile_model(model, device_name, config);
 
     ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
 }
@@ -84,7 +94,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForSingleAxis) {
     const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
 
     auto core = ov::Core();
-    const auto compiled_model = core.compile_model(model, device_name);
+    ov::AnyMap config;
+    if (device_name == CommonTestUtils::DEVICE_GPU)
+        config.insert(ov::hint::inference_precision(ov::element::f32));
+    const auto compiled_model = core.compile_model(model, device_name, config);
 
     ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
 }
diff --git a/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp b/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp
index 1d87c79abdd..132fb3efb05 100644
--- a/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp
+++ b/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp
@@ -216,6 +216,18 @@ void SubgraphBaseTest::compile_model() {
     }
 #endif
 
+    // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+    if (targetDevice == CommonTestUtils::DEVICE_GPU) {
+        ov::element::Type hint = ov::element::f32;
+        for (auto& param : function->get_parameters()) {
+            if (param->get_output_element_type(0) == ov::element::f16) {
+                hint = ov::element::f16;
+                break;
+            }
+        }
+        configuration.insert({ov::hint::inference_precision.name(), hint});
+    }
+
     compiledModel = core->compile_model(function, targetDevice, configuration);
     if (is_report_stages) {
         auto end_time = std::chrono::system_clock::now();
diff --git a/tests/layer_tests/common/layer_test_class.py b/tests/layer_tests/common/layer_test_class.py
index 3989bcd27d8..c1c26cd8fc6 100644
--- a/tests/layer_tests/common/layer_test_class.py
+++ b/tests/layer_tests/common/layer_test_class.py
@@ -76,6 +76,11 @@ class CommonLayerTest:
         # (flag, resp) = ir.compare(ref_net)
         # assert flag, '\n'.join(resp)
 
+        config = None
+        # GPU default execution precision is FP16, so if we want to check FP32 inference we need to set explicit precision hint
+        if ie_device == 'GPU' and precision == 'FP32':
+            config = {'INFERENCE_PRECISION_HINT' : 'f32'}
+
         if self.use_old_api:
             ie_engine = IEInfer(model=path_to_xml,
                                 weights=path_to_bin,
@@ -93,7 +98,7 @@ class CommonLayerTest:
         inputs_dict = self._prepare_input(ie_engine.get_inputs_info(precision))
 
         # IE infer:
-        infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout)
+        infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout, config=config)
 
         if hasattr(self, 'skip_framework') and self.skip_framework:
             warnings.warn('Framework is skipped')
diff --git a/tests/layer_tests/common/layer_utils.py b/tests/layer_tests/common/layer_utils.py
index 4cc43d3d075..c2e3152db08 100644
--- a/tests/layer_tests/common/layer_utils.py
+++ b/tests/layer_tests/common/layer_utils.py
@@ -23,14 +23,14 @@ class BaseInfer:
         self.name = name
         self.res = None
 
-    def fw_infer(self, input_data):
+    def fw_infer(self, input_data, config=None):
         raise RuntimeError("This is base class, please implement infer function for the specific framework")
 
     def get_inputs_info(self, precision) -> dict:
         raise RuntimeError("This is base class, please implement get_inputs_info function for the specific framework")
 
-    def infer(self, input_data, infer_timeout=10):
-        self.res = multiprocessing_run(self.fw_infer, [input_data], self.name, infer_timeout)
+    def infer(self, input_data, config=None, infer_timeout=10):
+        self.res = multiprocessing_run(self.fw_infer, [input_data, config], self.name, infer_timeout)
         return self.res
 
 
@@ -41,7 +41,7 @@ class IEInfer(BaseInfer):
         self.model = model
         self.weights = weights
 
-    def fw_infer(self, input_data):
+    def fw_infer(self, input_data, config=None):
         print("Inference Engine version: {}".format(ie_get_version()))
         print("Creating IE Core Engine...")
 
@@ -49,7 +49,7 @@ class IEInfer(BaseInfer):
         print("Reading network files")
         net = ie.read_network(self.model, self.weights)
         print("Loading network")
-        exec_net = ie.load_network(net, self.device)
+        exec_net = ie.load_network(net, self.device, config)
         print("Starting inference")
         result = exec_net.infer(input_data)
 
@@ -78,14 +78,14 @@ class InferAPI20(BaseInfer):
         self.weights = weights
         self.use_new_frontend = use_new_frontend
 
-    def fw_infer(self, input_data):
+    def fw_infer(self, input_data, config=None):
         print("Inference Engine version: {}".format(ie2_get_version()))
         print("Creating IE Core Engine...")
         ie = Core()
         print("Reading network files")
         net = ie.read_model(self.model, self.weights)
         print("Loading network")
-        exec_net = ie.compile_model(net, self.device)
+        exec_net = ie.compile_model(net, self.device, config)
         print("Starting inference")
         request = exec_net.create_infer_request()
         request_result = request.infer(input_data)
diff --git a/tests/layer_tests/common/onnx_layer_test_class.py b/tests/layer_tests/common/onnx_layer_test_class.py
index fa1ad08da50..2fd8eb7b20a 100644
--- a/tests/layer_tests/common/onnx_layer_test_class.py
+++ b/tests/layer_tests/common/onnx_layer_test_class.py
@@ -19,7 +19,7 @@ class OnnxRuntimeInfer(BaseInfer):
         super().__init__('OnnxRuntime')
         self.net = net
 
-    def fw_infer(self, input_data):
+    def fw_infer(self, input_data, config=None):
         import onnxruntime as rt
 
         sess = rt.InferenceSession(self.net)