[GPU] Change default infer precision to fp16 (#14752)
commit e23b8492c5
parent f31ebd4947
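With this change, a model compiled on the GPU device without an explicit precision hint executes in f16, and the tests below pin f32 wherever fp32 numerics are validated. A minimal sketch of the user-facing effect (the model path is a placeholder, not part of this patch):

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // placeholder model path

        // Default after this commit: the GPU plugin runs the model in f16.
        auto fp16_by_default = core.compile_model(model, "GPU");

        // To keep fp32 runtime precision (as the updated tests do), pass the hint explicitly.
        auto fp32_forced = core.compile_model(model, "GPU",
                                              ov::hint::inference_precision(ov::element::f32));
        return 0;
    }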
@@ -32,7 +32,7 @@ struct Config {
         max_dynamic_batch(1),
         customLayers({}),
         kernels_cache_dir(""),
-        inference_precision(ov::element::undefined),
+        inference_precision(ov::element::f16),
         task_exec_config({"GPU plugin internal task executor",                                  // name
                           std::max(1, static_cast<int>(std::thread::hardware_concurrency())),  // # of streams
                           1,                                                                    // # of threads per streams
@@ -55,7 +55,7 @@ TEST_P(OVConcurrencyTest, canInferTwoExecNets) {
         auto fn = fn_ptrs[i];

         auto exec_net = ie.compile_model(fn_ptrs[i], CommonTestUtils::DEVICE_GPU,
-                                         {{ov::ie::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+                                         {ov::num_streams(num_streams), ov::hint::inference_precision(ov::element::f32)});

         auto input = fn_ptrs[i]->get_parameters().at(0);
         auto output = fn_ptrs[i]->get_results().at(0);
@@ -115,7 +115,7 @@ TEST(canSwapTensorsBetweenInferRequests, inputs) {
     auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();

     auto ie = ov::Core();
-    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));

     const int infer_requests_num = 2;
     ov::InferRequest infer_request1 = compiled_model.create_infer_request();
@@ -193,7 +193,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, usmHostIsNotChanged) {
     auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);

     auto ie = ov::Core();
-    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));

     ov::InferRequest infer_request1 = compiled_model.create_infer_request();
     ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -232,7 +232,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, canSetSystemHostTensor) {
     auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);

     auto ie = ov::Core();
-    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));

     ov::InferRequest infer_request1 = compiled_model.create_infer_request();
     ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -258,7 +258,7 @@ TEST(canSwapTensorsBetweenInferRequests, outputs) {
     auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();

     auto ie = ov::Core();
-    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+    auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));

     const int infer_requests_num = 2;
     ov::InferRequest infer_request1 = compiled_model.create_infer_request();
@@ -40,6 +40,7 @@ public:
                 {CONFIG_KEY(AUTO_BATCH_TIMEOUT) , "0"},
             };
         }
+        config.insert({ov::hint::inference_precision.name(), "f32"});
         fn_ptr = ov::test::behavior::getDefaultNGraphFunctionForTheDevice(with_auto_batching ? CommonTestUtils::DEVICE_BATCH : deviceName);
     }
     static std::string getTestCaseName(const testing::TestParamInfo<bool>& obj) {
@@ -229,7 +230,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserContext) {
    auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());

    auto ie = PluginCache::get().ie();
-   auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+   auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});

    // regular inference
    auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -276,7 +277,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
    auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());

    auto ie = PluginCache::get().ie();
-   auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+   auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});

    // regular inference
    auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -304,7 +305,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
    // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
    // without calling thread blocks
    auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
-   auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+   auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
    auto inf_req_shared = exec_net_shared.CreateInferRequest();

    // Allocate shared buffers for input and output data which will be set to infer request
@@ -374,7 +375,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
    auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());

    auto ie = PluginCache::get().ie();
-   auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+   auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});

    // regular inference
    auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -403,7 +404,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
    // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
    // without calling thread blocks
    auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
-   auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+   auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
    auto inf_req_shared = exec_net_shared.CreateInferRequest();

    // Allocate shared buffers for input and output data which will be set to infer request
@@ -468,7 +469,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
    auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());

    auto ie = PluginCache::get().ie();
-   auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+   auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});

    // regular inference
    auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -497,7 +498,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
    // In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
    // without calling thread blocks
    auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
-   auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+   auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
    auto inf_req_shared = exec_net_shared.CreateInferRequest();

    // Allocate shared buffers for input and output data which will be set to infer request
@@ -600,7 +601,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {

    /* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */
    auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU,
-               { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES} });
+               { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES}, {ov::hint::inference_precision.name(), "f32"} });
    auto inf_req_remote = exec_net_b.CreateInferRequest();
    auto cldnn_context = exec_net_b.GetContext();
    cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
@@ -669,7 +670,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
    net_local.getInputsInfo().begin()->second->setPrecision(Precision::U8);
    net_local.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);

-   auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU);
+   auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU, {{ov::hint::inference_precision.name(), "f32"}});

    auto inf_req_local = exec_net_b1.CreateInferRequest();

@@ -740,7 +741,8 @@ TEST_P(TwoNets_Test, canInferTwoExecNets) {
        net.getInputsInfo().begin()->second->setPrecision(Precision::FP32);

        auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU,
-                                      {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+                                      {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)},
+                                       {ov::hint::inference_precision.name(), "f32"}});

        for (int j = 0; j < num_streams * num_requests; j++) {
            outputs.push_back(net.getOutputsInfo().begin()->first);
@@ -344,12 +344,12 @@ TEST_P(OVClassGetPropertyTest_GPU, GetAndSetEnableProfilingNoThrow) {
 TEST_P(OVClassGetPropertyTest_GPU, GetAndSetInferencePrecisionNoThrow) {
     ov::Core ie;
     auto value = ov::element::undefined;
-    const auto expected_default_precision = ov::element::undefined;
+    const auto expected_default_precision = ov::element::f16;

     OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
     ASSERT_EQ(expected_default_precision, value);

-    const auto forced_precision = ov::element::f16;
+    const auto forced_precision = ov::element::f32;

     OV_ASSERT_NO_THROW(ie.set_property(target_device, ov::hint::inference_precision(forced_precision)));
     OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
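The same default can be observed and overridden through the property API exercised by the test above; a small sketch, assuming the device string "GPU":

    #include <openvino/openvino.hpp>
    #include <iostream>

    int main() {
        ov::Core core;

        // After this commit the GPU plugin reports f16 as its default inference precision.
        ov::element::Type current = core.get_property("GPU", ov::hint::inference_precision);
        std::cout << "default inference precision: " << current.get_type_name() << std::endl;

        // Force fp32 execution for subsequent compilations on this device.
        core.set_property("GPU", ov::hint::inference_precision(ov::element::f32));
        return 0;
    }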
@@ -5,4 +5,16 @@
 #include "functional_test_utils/core_config.hpp"

 void CoreConfiguration(LayerTestsUtils::LayerTestsCommon* test) {
+    std::shared_ptr<InferenceEngine::Core> core = PluginCache::get().ie();
+    ov::element::Type hint = ov::element::f32;
+    for (auto& param : test->GetFunction()->get_parameters()) {
+        if (param->get_output_element_type(0) == ov::element::f16) {
+            hint = ov::element::f16;
+            break;
+        }
+    }
+
+    // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+    std::map<std::string, std::string> config = {{"INFERENCE_PRECISION_HINT", hint.get_type_name()}};
+    core->SetConfig(config, CommonTestUtils::DEVICE_GPU);
 }
@@ -125,5 +125,8 @@ std::vector<std::string> disabledTestPatterns() {
         R"(.*smoke_GroupConvolution1D_ExplicitPadding_Disabled.*)",
         R"(.*smoke_GroupConvolutionLayerGPUTest_dynamic1DSymPad_Disabled.*)",
         R"(.*smoke_ConvolutionLayerGPUTest_dynamic1DSymPad.*)",
+
+        // Looks like the test is targeting CPU plugin and doesn't respect that execution graph may vary from plugin to plugin
+        R"(.*ExecGraphSerializationTest.*)",
     };
 }
@@ -70,8 +70,11 @@ protected:
            n.second->setPrecision(Precision::FP32);
        }
        std::map<std::string, std::string> config;
-       if (target_device.find("GPU") != std::string::npos)
+       if (target_device.find("GPU") != std::string::npos) {
            config[CONFIG_KEY(GPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
+           config["INFERENCE_PRECISION_HINT"] = "f32";
+       }

        if (target_device.find("CPU") != std::string::npos) {
            config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
            config[CONFIG_KEY(ENFORCE_BF16)] = CONFIG_VALUE(NO);
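For the legacy InferenceEngine API used in the hunk above, the hint travels as a plain string key; a minimal sketch, assuming the network object is already prepared:

    #include <inference_engine.hpp>
    #include <map>
    #include <string>

    InferenceEngine::ExecutableNetwork load_fp32_on_gpu(InferenceEngine::Core& ie,
                                                        InferenceEngine::CNNNetwork& net) {
        // "INFERENCE_PRECISION_HINT" is the string form of ov::hint::inference_precision
        // used throughout these tests; "f32" restores the pre-change fp32 execution.
        std::map<std::string, std::string> config = {{"INFERENCE_PRECISION_HINT", "f32"}};
        return ie.LoadNetwork(net, "GPU", config);
    }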
@@ -11,6 +11,7 @@

 #include "functional_test_utils/skip_tests_config.hpp"
 #include "common_test_utils/ngraph_test_utils.hpp"
+#include "common_test_utils/test_constants.hpp"
 #include "execution_graph_tests/normalize_l2_decomposition.hpp"

 namespace ExecutionGraphTests {
@@ -33,7 +34,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNonContiguousAxes
    const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});

    auto core = ov::Core();
-   const auto compiled_model = core.compile_model(model, device_name);
+   ov::AnyMap config;
+   if (device_name == CommonTestUtils::DEVICE_GPU)
+       config.insert(ov::hint::inference_precision(ov::element::f32));
+   const auto compiled_model = core.compile_model(model, device_name, config);

    ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
 }
@@ -50,7 +54,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNormalizeOverAllA
    const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});

    auto core = ov::Core();
-   const auto compiled_model = core.compile_model(model, device_name);
+   ov::AnyMap config;
+   if (device_name == CommonTestUtils::DEVICE_GPU)
+       config.insert(ov::hint::inference_precision(ov::element::f32));
+   const auto compiled_model = core.compile_model(model, device_name, config);

    ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
 }
@@ -67,7 +74,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForNotSorted) {
    const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});

    auto core = ov::Core();
-   const auto compiled_model = core.compile_model(model, device_name);
+   ov::AnyMap config;
+   if (device_name == CommonTestUtils::DEVICE_GPU)
+       config.insert(ov::hint::inference_precision(ov::element::f32));
+   const auto compiled_model = core.compile_model(model, device_name, config);

    ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
 }
@@ -84,7 +94,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForSingleAxis) {
    const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});

    auto core = ov::Core();
-   const auto compiled_model = core.compile_model(model, device_name);
+   ov::AnyMap config;
+   if (device_name == CommonTestUtils::DEVICE_GPU)
+       config.insert(ov::hint::inference_precision(ov::element::f32));
+   const auto compiled_model = core.compile_model(model, device_name, config);

    ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
 }
@@ -216,6 +216,18 @@ void SubgraphBaseTest::compile_model() {
     }
 #endif

+    // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+    if (targetDevice == CommonTestUtils::DEVICE_GPU) {
+        ov::element::Type hint = ov::element::f32;
+        for (auto& param : function->get_parameters()) {
+            if (param->get_output_element_type(0) == ov::element::f16) {
+                hint = ov::element::f16;
+                break;
+            }
+        }
+        configuration.insert({ov::hint::inference_precision.name(), hint});
+    }
+
     compiledModel = core->compile_model(function, targetDevice, configuration);
     if (is_report_stages) {
         auto end_time = std::chrono::system_clock::now();
@@ -76,6 +76,11 @@ class CommonLayerTest:
        # (flag, resp) = ir.compare(ref_net)
        # assert flag, '\n'.join(resp)

+       config = None
+       # GPU default execution precision is FP16, so if we want to check FP32 inference we need to set explicit precision hint
+       if ie_device == 'GPU' and precision == 'FP32':
+           config = {'INFERENCE_PRECISION_HINT' : 'f32'}
+
        if self.use_old_api:
            ie_engine = IEInfer(model=path_to_xml,
                                weights=path_to_bin,
@@ -93,7 +98,7 @@ class CommonLayerTest:
        inputs_dict = self._prepare_input(ie_engine.get_inputs_info(precision))

        # IE infer:
-       infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout)
+       infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout, config=config)

        if hasattr(self, 'skip_framework') and self.skip_framework:
            warnings.warn('Framework is skipped')
@@ -23,14 +23,14 @@ class BaseInfer:
        self.name = name
        self.res = None

-   def fw_infer(self, input_data):
+   def fw_infer(self, input_data, config=None):
        raise RuntimeError("This is base class, please implement infer function for the specific framework")

    def get_inputs_info(self, precision) -> dict:
        raise RuntimeError("This is base class, please implement get_inputs_info function for the specific framework")

-   def infer(self, input_data, infer_timeout=10):
-       self.res = multiprocessing_run(self.fw_infer, [input_data], self.name, infer_timeout)
+   def infer(self, input_data, config=None, infer_timeout=10):
+       self.res = multiprocessing_run(self.fw_infer, [input_data, config], self.name, infer_timeout)
        return self.res

@@ -41,7 +41,7 @@ class IEInfer(BaseInfer):
        self.model = model
        self.weights = weights

-   def fw_infer(self, input_data):
+   def fw_infer(self, input_data, config=None):

        print("Inference Engine version: {}".format(ie_get_version()))
        print("Creating IE Core Engine...")
@@ -49,7 +49,7 @@ class IEInfer(BaseInfer):
        print("Reading network files")
        net = ie.read_network(self.model, self.weights)
        print("Loading network")
-       exec_net = ie.load_network(net, self.device)
+       exec_net = ie.load_network(net, self.device, config)
        print("Starting inference")
        result = exec_net.infer(input_data)

@@ -78,14 +78,14 @@ class InferAPI20(BaseInfer):
        self.weights = weights
        self.use_new_frontend = use_new_frontend

-   def fw_infer(self, input_data):
+   def fw_infer(self, input_data, config=None):
        print("Inference Engine version: {}".format(ie2_get_version()))
        print("Creating IE Core Engine...")
        ie = Core()
        print("Reading network files")
        net = ie.read_model(self.model, self.weights)
        print("Loading network")
-       exec_net = ie.compile_model(net, self.device)
+       exec_net = ie.compile_model(net, self.device, config)
        print("Starting inference")
        request = exec_net.create_infer_request()
        request_result = request.infer(input_data)

@@ -19,7 +19,7 @@ class OnnxRuntimeInfer(BaseInfer):
        super().__init__('OnnxRuntime')
        self.net = net

-   def fw_infer(self, input_data):
+   def fw_infer(self, input_data, config=None):
        import onnxruntime as rt

        sess = rt.InferenceSession(self.net)
