[GPU] Change default infer precision to fp16 (#14752)

Author: Vladimir Paramuzov, 2023-01-04 15:55:44 +04:00 (committed by GitHub)
Parent: f31ebd4947
Commit: e23b8492c5
12 changed files with 82 additions and 32 deletions
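
For context: after this change, a model compiled for GPU without any hint runs with f16 inference precision, so applications that need fp32 execution must request it explicitly. Below is a minimal sketch of the pattern used throughout the updated tests, assuming the OpenVINO 2.0 C++ API and a placeholder model file:

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical IR file

    // Default after this commit: f16 inference precision on GPU.
    auto compiled_f16 = core.compile_model(model, "GPU");

    // Explicit hint restores the previous fp32 behavior.
    auto compiled_f32 = core.compile_model(model, "GPU",
                                           ov::hint::inference_precision(ov::element::f32));
    return 0;
}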

View File

@@ -32,7 +32,7 @@ struct Config {
max_dynamic_batch(1),
customLayers({}),
kernels_cache_dir(""),
- inference_precision(ov::element::undefined),
+ inference_precision(ov::element::f16),
task_exec_config({"GPU plugin internal task executor", // name
std::max(1, static_cast<int>(std::thread::hardware_concurrency())), // # of streams
1, // # of threads per streams

View File

@@ -55,7 +55,7 @@ TEST_P(OVConcurrencyTest, canInferTwoExecNets) {
auto fn = fn_ptrs[i];
auto exec_net = ie.compile_model(fn_ptrs[i], CommonTestUtils::DEVICE_GPU,
- {{ov::ie::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+ {ov::num_streams(num_streams), ov::hint::inference_precision(ov::element::f32)});
auto input = fn_ptrs[i]->get_parameters().at(0);
auto output = fn_ptrs[i]->get_results().at(0);
@@ -115,7 +115,7 @@ TEST(canSwapTensorsBetweenInferRequests, inputs) {
auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
const int infer_requests_num = 2;
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
@@ -193,7 +193,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, usmHostIsNotChanged) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -232,7 +232,7 @@ TEST(smoke_InferRequestDeviceMemoryAllocation, canSetSystemHostTensor) {
auto fn = ngraph::builder::subgraph::makeDetectionOutput(ngraph::element::Type_t::f32);
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
ov::InferRequest infer_request1 = compiled_model.create_infer_request();
ov::InferRequest infer_request2 = compiled_model.create_infer_request();
@@ -258,7 +258,7 @@ TEST(canSwapTensorsBetweenInferRequests, outputs) {
auto fn = ngraph::builder::subgraph::makeSplitMultiConvConcat();
auto ie = ov::Core();
- auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU);
+ auto compiled_model = ie.compile_model(fn, CommonTestUtils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f32));
const int infer_requests_num = 2;
ov::InferRequest infer_request1 = compiled_model.create_infer_request();

View File

@@ -40,6 +40,7 @@ public:
{CONFIG_KEY(AUTO_BATCH_TIMEOUT) , "0"},
};
}
+ config.insert({ov::hint::inference_precision.name(), "f32"});
fn_ptr = ov::test::behavior::getDefaultNGraphFunctionForTheDevice(with_auto_batching ? CommonTestUtils::DEVICE_BATCH : deviceName);
}
static std::string getTestCaseName(const testing::TestParamInfo<bool>& obj) {
@@ -229,7 +230,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserContext) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -276,7 +277,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -304,7 +305,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_out_of_order) {
// In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
// without calling thread blocks
auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
- auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+ auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
@@ -374,7 +375,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -403,7 +404,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_in_order) {
// In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
// without calling thread blocks
auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
- auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+ auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
@@ -468,7 +469,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
auto ie = PluginCache::get().ie();
- auto exec_net_regular = ie->LoadNetwork(net, deviceName);
+ auto exec_net_regular = ie->LoadNetwork(net, deviceName, {{ov::hint::inference_precision.name(), "f32"}});
// regular inference
auto inf_req_regular = exec_net_regular.CreateInferRequest();
@@ -497,7 +498,7 @@ TEST_P(RemoteBlob_Test, smoke_canInferOnUserQueue_infer_call_many_times) {
// In this scenario we create shared OCL queue and run simple pre-process action and post-process action (buffer copies in both cases)
// without calling thread blocks
auto remote_context = make_shared_context(*ie, deviceName, ocl_instance->_queue.get());
- auto exec_net_shared = ie->LoadNetwork(net, remote_context); // no auto-batching support, so no config is passed
+ auto exec_net_shared = ie->LoadNetwork(net, remote_context, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_shared = exec_net_shared.CreateInferRequest();
// Allocate shared buffers for input and output data which will be set to infer request
@@ -600,7 +601,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
/* XXX: is it correct to set KEY_CLDNN_NV12_TWO_INPUTS in case of remote blob? */
auto exec_net_b = ie.LoadNetwork(net_remote, CommonTestUtils::DEVICE_GPU,
- { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES} });
+ { { GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS, PluginConfigParams::YES}, {ov::hint::inference_precision.name(), "f32"} });
auto inf_req_remote = exec_net_b.CreateInferRequest();
auto cldnn_context = exec_net_b.GetContext();
cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
@@ -669,7 +670,7 @@ TEST_P(BatchedBlob_Test, canInputNV12) {
net_local.getInputsInfo().begin()->second->setPrecision(Precision::U8);
net_local.getInputsInfo().begin()->second->getPreProcess().setColorFormat(ColorFormat::NV12);
- auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU);
+ auto exec_net_b1 = ie.LoadNetwork(net_local, CommonTestUtils::DEVICE_GPU, {{ov::hint::inference_precision.name(), "f32"}});
auto inf_req_local = exec_net_b1.CreateInferRequest();
@@ -740,7 +741,8 @@ TEST_P(TwoNets_Test, canInferTwoExecNets) {
net.getInputsInfo().begin()->second->setPrecision(Precision::FP32);
auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU,
- {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+ {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)},
+ {ov::hint::inference_precision.name(), "f32"}});
for (int j = 0; j < num_streams * num_requests; j++) {
outputs.push_back(net.getOutputsInfo().begin()->first);

View File

@@ -344,12 +344,12 @@ TEST_P(OVClassGetPropertyTest_GPU, GetAndSetEnableProfilingNoThrow) {
TEST_P(OVClassGetPropertyTest_GPU, GetAndSetInferencePrecisionNoThrow) {
ov::Core ie;
auto value = ov::element::undefined;
- const auto expected_default_precision = ov::element::undefined;
+ const auto expected_default_precision = ov::element::f16;
OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
ASSERT_EQ(expected_default_precision, value);
- const auto forced_precision = ov::element::f16;
+ const auto forced_precision = ov::element::f32;
OV_ASSERT_NO_THROW(ie.set_property(target_device, ov::hint::inference_precision(forced_precision)));
OV_ASSERT_NO_THROW(value = ie.get_property(target_device, ov::hint::inference_precision));
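
The same default is visible through the property API, which is what the test above now checks. A short sketch, assuming a GPU device is present, of querying the plugin-level default and forcing it back to f32:

#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // With this commit the reported default is expected to be f16 (previously undefined).
    auto default_precision = core.get_property("GPU", ov::hint::inference_precision);
    std::cout << "default inference precision: " << default_precision << std::endl;

    // The hint can still be overridden at the device level.
    core.set_property("GPU", ov::hint::inference_precision(ov::element::f32));
    auto forced_precision = core.get_property("GPU", ov::hint::inference_precision);
    std::cout << "forced inference precision: " << forced_precision << std::endl;
    return 0;
}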

View File

@@ -5,4 +5,16 @@
#include "functional_test_utils/core_config.hpp"
void CoreConfiguration(LayerTestsUtils::LayerTestsCommon* test) {
+ std::shared_ptr<InferenceEngine::Core> core = PluginCache::get().ie();
+ ov::element::Type hint = ov::element::f32;
+ for (auto& param : test->GetFunction()->get_parameters()) {
+ if (param->get_output_element_type(0) == ov::element::f16) {
+ hint = ov::element::f16;
+ break;
+ }
+ }
+ // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+ std::map<std::string, std::string> config = {{"INFERENCE_PRECISION_HINT", hint.get_type_name()}};
+ core->SetConfig(config, CommonTestUtils::DEVICE_GPU);
}
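
The helper above drives the hint through the legacy string-based SetConfig path. A rough equivalent with the 2.0 properties API (a sketch; the function name is illustrative, not part of this commit):

#include <memory>
#include <openvino/openvino.hpp>

// Pin the GPU plugin to f32 unless the model itself carries f16 inputs,
// mirroring the per-test logic added in CoreConfiguration() above.
void configure_gpu_precision(ov::Core& core, const std::shared_ptr<ov::Model>& model) {
    ov::element::Type hint = ov::element::f32;
    for (const auto& param : model->get_parameters()) {
        if (param->get_output_element_type(0) == ov::element::f16) {
            hint = ov::element::f16;
            break;
        }
    }
    core.set_property("GPU", ov::hint::inference_precision(hint));
}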

View File

@@ -125,5 +125,8 @@ std::vector<std::string> disabledTestPatterns() {
R"(.*smoke_GroupConvolution1D_ExplicitPadding_Disabled.*)",
R"(.*smoke_GroupConvolutionLayerGPUTest_dynamic1DSymPad_Disabled.*)",
R"(.*smoke_ConvolutionLayerGPUTest_dynamic1DSymPad.*)",
+ // Looks like the test is targeting CPU plugin and doesn't respect that execution graph may vary from plugin to plugin
+ R"(.*ExecGraphSerializationTest.*)",
};
}

View File

@@ -70,8 +70,11 @@ protected:
n.second->setPrecision(Precision::FP32);
}
std::map<std::string, std::string> config;
- if (target_device.find("GPU") != std::string::npos)
+ if (target_device.find("GPU") != std::string::npos) {
config[CONFIG_KEY(GPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
+ config["INFERENCE_PRECISION_HINT"] = "f32";
+ }
if (target_device.find("CPU") != std::string::npos) {
config[CONFIG_KEY(CPU_THROUGHPUT_STREAMS)] = std::to_string(num_streams);
config[CONFIG_KEY(ENFORCE_BF16)] = CONFIG_VALUE(NO);

View File

@@ -11,6 +11,7 @@
#include "functional_test_utils/skip_tests_config.hpp"
#include "common_test_utils/ngraph_test_utils.hpp"
#include "common_test_utils/test_constants.hpp"
#include "execution_graph_tests/normalize_l2_decomposition.hpp"
namespace ExecutionGraphTests {
@@ -33,7 +34,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNonContiguousAxes
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
}
@@ -50,7 +54,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeAppliedForNormalizeOverAllA
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() < compiled_model.get_runtime_model()->get_ops().size()); // decomposition applied
}
@@ -67,7 +74,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForNotSorted) {
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
}
@@ -84,7 +94,10 @@ TEST_P(ExecGrapDecomposeNormalizeL2, CheckIfDecomposeNotAppliedForSingleAxis) {
const auto model = std::make_shared<ov::Model>(ov::NodeVector{normalize_l2}, ov::ParameterVector{input});
auto core = ov::Core();
- const auto compiled_model = core.compile_model(model, device_name);
+ ov::AnyMap config;
+ if (device_name == CommonTestUtils::DEVICE_GPU)
+ config.insert(ov::hint::inference_precision(ov::element::f32));
+ const auto compiled_model = core.compile_model(model, device_name, config);
ASSERT_TRUE(model->get_ops().size() >= compiled_model.get_runtime_model()->get_ops().size()); // decomposition not applied
}

View File

@@ -216,6 +216,18 @@ void SubgraphBaseTest::compile_model() {
}
#endif
+ // Set inference_precision hint to run fp32 model in fp32 runtime precision as default plugin execution precision may vary
+ if (targetDevice == CommonTestUtils::DEVICE_GPU) {
+ ov::element::Type hint = ov::element::f32;
+ for (auto& param : function->get_parameters()) {
+ if (param->get_output_element_type(0) == ov::element::f16) {
+ hint = ov::element::f16;
+ break;
+ }
+ }
+ configuration.insert({ov::hint::inference_precision.name(), hint});
+ }
compiledModel = core->compile_model(function, targetDevice, configuration);
if (is_report_stages) {
auto end_time = std::chrono::system_clock::now();

View File

@@ -76,6 +76,11 @@ class CommonLayerTest:
# (flag, resp) = ir.compare(ref_net)
# assert flag, '\n'.join(resp)
+ config = None
+ # GPU default execution precision is FP16, so if we want to check FP32 inference we need to set explicit precision hint
+ if ie_device == 'GPU' and precision == 'FP32':
+ config = {'INFERENCE_PRECISION_HINT' : 'f32'}
if self.use_old_api:
ie_engine = IEInfer(model=path_to_xml,
weights=path_to_bin,
@@ -93,7 +98,7 @@ class CommonLayerTest:
inputs_dict = self._prepare_input(ie_engine.get_inputs_info(precision))
# IE infer:
- infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout)
+ infer_res = ie_engine.infer(input_data=inputs_dict, infer_timeout=infer_timeout, config=config)
if hasattr(self, 'skip_framework') and self.skip_framework:
warnings.warn('Framework is skipped')

View File

@@ -23,14 +23,14 @@ class BaseInfer:
self.name = name
self.res = None
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
raise RuntimeError("This is base class, please implement infer function for the specific framework")
def get_inputs_info(self, precision) -> dict:
raise RuntimeError("This is base class, please implement get_inputs_info function for the specific framework")
- def infer(self, input_data, infer_timeout=10):
- self.res = multiprocessing_run(self.fw_infer, [input_data], self.name, infer_timeout)
+ def infer(self, input_data, config=None, infer_timeout=10):
+ self.res = multiprocessing_run(self.fw_infer, [input_data, config], self.name, infer_timeout)
return self.res
@@ -41,7 +41,7 @@ class IEInfer(BaseInfer):
self.model = model
self.weights = weights
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
print("Inference Engine version: {}".format(ie_get_version()))
print("Creating IE Core Engine...")
@@ -49,7 +49,7 @@ class IEInfer(BaseInfer):
print("Reading network files")
net = ie.read_network(self.model, self.weights)
print("Loading network")
- exec_net = ie.load_network(net, self.device)
+ exec_net = ie.load_network(net, self.device, config)
print("Starting inference")
result = exec_net.infer(input_data)
@@ -78,14 +78,14 @@ class InferAPI20(BaseInfer):
self.weights = weights
self.use_new_frontend = use_new_frontend
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
print("Inference Engine version: {}".format(ie2_get_version()))
print("Creating IE Core Engine...")
ie = Core()
print("Reading network files")
net = ie.read_model(self.model, self.weights)
print("Loading network")
- exec_net = ie.compile_model(net, self.device)
+ exec_net = ie.compile_model(net, self.device, config)
print("Starting inference")
request = exec_net.create_infer_request()
request_result = request.infer(input_data)

View File

@@ -19,7 +19,7 @@ class OnnxRuntimeInfer(BaseInfer):
super().__init__('OnnxRuntime')
self.net = net
- def fw_infer(self, input_data):
+ def fw_infer(self, input_data, config=None):
import onnxruntime as rt
sess = rt.InferenceSession(self.net)