[IE CLDNN] Build time optimization for OCL kernels (#4725)

Build OCL kernel batches of different buckets in parallel

Co-authored-by: Donghyeon Jeong <donghyeon.jeong@intel.com>
Taylor Yeonbok Lee 2021-03-31 16:22:19 +09:00 committed by GitHub
parent d674eebd52
commit b58c648d2d
19 changed files with 340 additions and 180 deletions


@ -72,6 +72,11 @@ DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);
*/
DECLARE_CLDNN_CONFIG_KEY(NV12_TWO_INPUTS);
/**
* @brief This key sets the maximum number of host threads that can be used by the GPU plugin during model loading.
* Default value is the maximum number of threads available in the environment.
*/
DECLARE_CLDNN_CONFIG_KEY(MAX_NUM_THREADS);
} // namespace CLDNNConfigParams
} // namespace InferenceEngine
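
For reference, a minimal sketch (assuming the 2021-era InferenceEngine C++ API and the usual cldnn_config.hpp header location) of how a caller could cap the number of compilation threads with the new key; the model path and the value "4" are illustrative only:

    #include <ie_core.hpp>
    #include <cldnn/cldnn_config.hpp>   // assumed header providing InferenceEngine::CLDNNConfigParams

    int main() {
        InferenceEngine::Core ie;
        auto network = ie.ReadNetwork("model.xml");   // hypothetical model path
        // Limit OCL kernel compilation during LoadNetwork to 4 host threads.
        auto exec_net = ie.LoadNetwork(network, "GPU",
            {{InferenceEngine::CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS, "4"}});
        return 0;
    }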


@ -48,6 +48,7 @@ DEFINE_string(i, "", image_message);
/// @brief Define parameter for set model file <br>
/// It is a required parameter
DEFINE_string(m, "", model_message);
DEFINE_string(m2, "", model_message);
/// @brief device the target device to infer on <br>
DEFINE_string(d, "CPU", target_device_message);


@ -40,6 +40,8 @@ target_include_directories(${TARGET_NAME} PRIVATE
set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO})
set_ie_threading_interface_for(clDNN_lib)
# Failed because of OpenCL
# ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})


@ -11,6 +11,7 @@
#include "ie_api.h"
#include "file_utils.h"
#include "cldnn_itt.h"
#include <thread>
#ifdef _WIN32
# include <direct.h>
@ -221,6 +222,20 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
IE_THROW(NotFound) << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS) == 0) {
int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
try {
int val_i = std::stoi(val);
if (val_i <= 0 || val_i > max_threads) {
n_threads = max_threads;
} else {
n_threads = val_i;
}
} catch (const std::exception&) {
IE_THROW() << "Wrong value for property key " << CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS << ": " << val
<< "\nSpecify the number of threads use for build as an integer."
<< "\nOut of range value will be set as a default value, maximum concurrent threads.";
}
} else {
IE_THROW(NotFound) << "Unsupported property key by plugin: " << key;
}
@ -306,5 +321,6 @@ void Config::adjustKeyMapValues() {
key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
key_config_map[CLDNNConfigParams::KEY_CLDNN_MAX_NUM_THREADS] = std::to_string(n_threads);
}
} // namespace CLDNNPlugin
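
The value handling above can be summarized by a small standalone helper; this is only an illustration of the clamping logic, not plugin code:

    #include <algorithm>
    #include <string>
    #include <thread>

    // Mirrors the KEY_CLDNN_MAX_NUM_THREADS handling: non-positive or
    // out-of-range values fall back to the hardware thread count.
    static int resolve_max_num_threads(const std::string& val) {
        const int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
        const int val_i = std::stoi(val);   // throws for non-numeric input, which the plugin reports as an error
        return (val_i <= 0 || val_i > max_threads) ? max_threads : val_i;
    }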


@ -31,7 +31,8 @@ struct Config {
graph_dumps_dir(""),
sources_dumps_dir(""),
device_id(""),
kernels_cache_dir("") {
kernels_cache_dir(""),
n_threads(std::max(static_cast<unsigned int>(1), std::thread::hardware_concurrency())) {
adjustKeyMapValues();
}
@ -56,6 +57,7 @@ struct Config {
std::string sources_dumps_dir;
std::string device_id;
std::string kernels_cache_dir;
size_t n_threads;
std::map<std::string, std::string> key_config_map;
};


@ -494,7 +494,8 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEn
context_config.tuningConfig.mode == current_config.tuningConfig.mode &&
context_config.tuningConfig.cache_file_path == current_config.tuningConfig.cache_file_path &&
context_config.kernels_cache_dir == current_config.kernels_cache_dir &&
context_config.device_id == current_config.device_id;
context_config.device_id == current_config.device_id &&
context_config.n_threads == current_config.n_threads;
};
{


@ -267,7 +267,8 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
m_config.queueThrottle,
m_config.memory_pool_on,
m_config.throughput_streams,
m_config.kernels_cache_dir));
m_config.kernels_cache_dir,
m_config.n_threads));
}
}


@ -4,7 +4,7 @@
#include <vector>
#include "low_precision_transformations/output_layers_handling_in_transformations.hpp"
#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace LayerTestsDefinitions;
@ -19,11 +19,11 @@ const std::vector<LayerTransformation::Params> trasformationParamValues = {
LayerTestsUtils::LayerTransformationParamsFactory::createParams()
};
INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations,
INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformationsForConcat,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::ValuesIn(trasformationParamValues)),
OutputLayersHandlingInTransformations::getTestCaseName);
OutputLayersHandlingInTransformationsForConcat::getTestCaseName);
} // namespace


@ -4,7 +4,7 @@
#include <vector>
#include "low_precision_transformations/output_layers_handling_in_transformations.hpp"
#include "low_precision_transformations/output_layers_handling_in_transformations_for_concat_multi_channel.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace LayerTestsDefinitions;
@ -19,11 +19,11 @@ const std::vector<LayerTransformation::Params> trasformationParamValues = {
LayerTestsUtils::LayerTransformationParamsFactory::createParams()
};
INSTANTIATE_TEST_CASE_P(smoke_LPT, OutputLayersHandlingInTransformations,
INSTANTIATE_TEST_CASE_P(DISABLED_smoke_LPT, OutputLayersHandlingInTransformationsForConcatMultiChannel,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::SizeVector({ 1, 3, 16, 16 })),
::testing::Values(CommonTestUtils::DEVICE_GPU),
::testing::ValuesIn(trasformationParamValues)),
OutputLayersHandlingInTransformations::getTestCaseName);
OutputLayersHandlingInTransformationsForConcatMultiChannel::getTestCaseName);
} // namespace


@ -39,7 +39,7 @@ if (ENABLE_CLDNN)
else()
set(CLDNN__ARCHITECTURE_TARGET "Linux64" CACHE STRING "" FORCE)
endif()
set(CLDNN_THREADING "${THREADING}" CACHE STRING "" FORCE)
add_subdirectory(clDNN)
# disable CLDNN docs build


@ -59,6 +59,14 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
# ======================================================================================================
# ====================================== HELPER CONSTANT VARIABLES =====================================
# ======================================================================================================
# ======================================================================================================
if("${CLDNN_THREADING}" MATCHES "SEQ")
add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_SEQ)
elseif("${CLDNN_THREADING}" MATCHES "TBB")
add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_TBB)
else()
add_definitions(-DCLDNN_THREADING=CLDNN_THREADING_THREADPOOL)
endif()
# Path which points to main directory of project.
set(CLDNN__MAIN_DIR "${CMAKE_CURRENT_SOURCE_DIR}")


@ -123,6 +123,7 @@
#include <memory>
#include <string>
#include <type_traits>
#include <thread>
namespace cldnn {


@ -10,6 +10,7 @@
#include <stdexcept>
#include <vector>
#include <map>
#include <algorithm>
namespace cldnn {
@ -61,6 +62,7 @@ struct engine_configuration {
///< (switched off for older drivers than NEO).
uint16_t n_streams; ///< Number of queues executed in parallel
const std::string kernels_cache_path; ///< Path to compiled kernels cache
uint16_t n_threads; ///< Number of host threads used to build OCL kernels
const std::string tuning_cache_path; ///< Path to tuning kernel cache
/// @brief Constructs engine configuration with specified options.
@ -83,6 +85,7 @@ struct engine_configuration {
bool memory_pool = true,
uint16_t n_streams = 1,
const std::string& kernels_cache_path = "",
uint16_t n_threads = std::max(static_cast<uint16_t>(std::thread::hardware_concurrency()), static_cast<uint16_t>(1)),
const std::string& tuning_cache_path = "cache.json")
: enable_profiling(profiling)
, meaningful_kernels_names(decorate_kernel_names)
@ -97,6 +100,7 @@ struct engine_configuration {
, enable_memory_pool(memory_pool)
, n_streams(n_streams)
, kernels_cache_path(kernels_cache_path)
, n_threads(n_threads)
, tuning_cache_path(tuning_cache_path) {
if (n_streams == 0) {
throw std::invalid_argument("Invalid streams count set in engine config");


@ -83,6 +83,7 @@ gpu_toolkit_config convert_configuration(const engine_configuration conf) {
result.queues_num = conf.n_streams;
result.kernels_cache_path = conf.kernels_cache_path;
result.tuning_cache_path = conf.tuning_cache_path;
result.n_threads = conf.n_threads;
return result;
}


@ -4,6 +4,7 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "configuration.h"
#include <algorithm>
namespace cldnn {
namespace gpu {
@ -22,6 +23,7 @@ configuration::configuration()
throttle_mode(throttle_mode_types::disabled),
queues_num(0),
tuning_cache_path("cache.json"),
kernels_cache_path("") {}
kernels_cache_path(""),
n_threads(std::max(static_cast<uint16_t>(std::thread::hardware_concurrency()), static_cast<uint16_t>(1))) {}
} // namespace gpu
} // namespace cldnn


@ -31,6 +31,7 @@ struct configuration {
uint16_t queues_num;
std::string tuning_cache_path;
std::string kernels_cache_path;
uint16_t n_threads;
};
} // namespace gpu
} // namespace cldnn


@ -13,9 +13,17 @@
#include <string>
#include <memory>
#include <utility>
#include "kernel_selector_helper.h"
#include "cldnn_itt.h"
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
#include <thread>
#include <future>
#include <queue>
#include <condition_variable>
#endif
#ifndef ENABLE_UNICODE_PATH_SUPPORT
# ifdef _WIN32
@ -36,8 +44,10 @@
#include <Windows.h>
#endif
#if (CLDNN_THREADING != CLDNN_THREADING_SEQ)
#define DEFAULT_NUM_THREADS 2
#endif
namespace {
std::mutex cacheAccessMutex;
#ifdef ENABLE_UNICODE_PATH_SUPPORT
@ -84,7 +94,6 @@ static std::vector<unsigned char> loadBinaryFromFile(std::string path) {
return {};
}
static void saveBinaryToFile(std::string path, const std::vector<unsigned char> buffer) {
std::lock_guard<std::mutex> lock(cacheAccessMutex);
#if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
@ -190,9 +199,10 @@ size_t kernels_cache::get_max_kernels_per_batch() const {
return 10;
}
kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& kernels_source_code) const {
void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector<kernels_cache::batch_program>* all_batches) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll::GetProgramSource");
sorted_code scode;
std::map<std::string, std::vector<batch_program>> program_buckets;
for (const auto& code : kernels_source_code) {
std::string full_code = code.kernel_strings->jit + code.kernel_strings->str;
@ -213,7 +223,7 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
std::string key = options;
if (batch_compilation == false) {
key += " __PROGRAM__" + std::to_string(scode.size());
key += " __PROGRAM__" + std::to_string(program_buckets.size());
}
if (dump_custom_program) {
@ -223,48 +233,63 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code&
if (one_time_kernel) {
key += " __ONE_TIME__";
}
auto& current_bucket = scode[key];
current_bucket.dump_custom_program = dump_custom_program;
current_bucket.one_time = one_time_kernel;
if (current_bucket.source.empty()) {
current_bucket.options = options;
auto& current_bucket = program_buckets[key];
if (current_bucket.empty()) { // new bucket
const auto& bucket_id = program_buckets.size() - 1;
current_bucket.push_back(batch_program());
current_bucket.back().bucket_id = static_cast<int32_t>(bucket_id);
current_bucket.back().batch_id = 0;
current_bucket.back().options = options;
}
// Create new kernels bucket when the limit is reached
if ((current_bucket.kernels_counter % get_max_kernels_per_batch()) == 0) {
current_bucket.source.push_back({});
// Create new kernels batch when the limit is reached
if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()) {
const auto& batch_id = current_bucket.size();
current_bucket.push_back(batch_program());
current_bucket.back().bucket_id = static_cast<int32_t>(program_buckets.size());
current_bucket.back().batch_id = static_cast<int32_t>(batch_id);
current_bucket.back().options = options;
}
current_bucket.entry_point_to_id[entry_point] = code.id;
auto& current_batch = current_bucket.back();
current_batch.dump_custom_program = dump_custom_program;
current_batch.one_time = one_time_kernel;
current_batch.entry_point_to_id[entry_point] = code.id;
assert(org_source_code.size() == 1);
current_bucket.source.back().push_back(std::move(org_source_code.front()));
current_bucket.kernels_counter++;
current_batch.source.push_back(std::move(org_source_code.front()));
current_batch.kernels_counter++;
}
// Compute hash value for each bucket
// Compute hash value for each batch
// Hash calculation might require additional optimizations, but currently execution time of this part is much smaller than loading
// of the precompiled binaries or get_undef_jit calls
// Hash is computed for string that contains compilation options + driver version +
// full source code (jit + template + undef sections) of all kernels in the bucket
for (auto& c : scode) {
program_code& code = c.second;
// full source code (jit + template + undef sections) of all kernels in the batches
for (auto& c : program_buckets) {
auto options = c.first;
for (size_t i = 0; i < code.source.size(); i++) {
auto& batches = c.second;
for (auto& b : batches) {
std::string full_code = options + " " + _context.get_device_info().driver_version;
for (auto& ss : code.source[i])
for (auto& ss : b.source)
full_code += ss;
code.hash_values.push_back(std::hash<std::string>()(full_code));
b.hash_value = std::hash<std::string>()(full_code);
all_batches->push_back(b);
}
}
return scode;
}
kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {}
kernels_cache::kernels_cache(gpu_toolkit& context, uint32_t prog_id) : _context(context), _prog_id(prog_id) {
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
int n_threads = _context.get_configuration().n_threads;
arena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena());
arena->initialize(n_threads);
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
int n_threads = _context.get_configuration().n_threads;
pool = std::unique_ptr<thread_pool>(new thread_pool(n_threads));
#endif
}
kernels_cache::kernel_id kernels_cache::set_kernel_source(
const std::shared_ptr<kernel_selector::kernel_string>& kernel_string,
@ -301,149 +326,160 @@ static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
return program.getInfo<CL_PROGRAM_BINARIES>().front();
}
kernels_cache::kernels_map kernels_cache::build_program(const program_code& program_source) const {
void kernels_cache::build_batch(const batch_program& batch) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram");
static uint32_t current_file_index = 0;
bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || program_source.dump_custom_program;
bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || batch.dump_custom_program;
std::string dump_file_name = "";
std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which
std::string current_dump_file_name = "";
if (dump_sources) {
dump_file_name = _context.get_configuration().ocl_sources_dumps_dir;
if (!dump_file_name.empty() && dump_file_name.back() != '/')
dump_file_name += '/';
current_dump_file_name = _context.get_configuration().ocl_sources_dumps_dir;
if (!current_dump_file_name.empty() && current_dump_file_name.back() != '/')
current_dump_file_name += '/';
dump_file_name += "clDNN_program_" + std::to_string(current_file_index++) + "_part_";
current_dump_file_name += "clDNN_program_" + std::to_string(batch.bucket_id) + "_part_" + std::to_string(batch.batch_id) + ".cl";
}
std::ofstream dump_file;
if (dump_sources) {
dump_file.open(current_dump_file_name);
if (dump_file.good()) {
for (auto& s : batch.source)
dump_file << s;
}
}
std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache";
cl::Program::Binaries precompiled_kernels = {};
if (is_cache_enabled()) {
// Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
// If read is successful, then remove kernels from compilation bucket
auto bin = loadBinaryFromFile(cached_bin_name);
if (!bin.empty()) {
precompiled_kernels.push_back(bin);
}
}
try {
kernels_map kmap;
std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which
// failed to compile)
cl::vector<cl::Kernel> kernels;
// Run compilation
if (precompiled_kernels.empty()) {
cl::Program program(_context.context(), batch.source);
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation");
program.build(_context.device(), batch.options.c_str());
}
if (dump_sources && dump_file.good()) {
dump_file << "\n/* Build Log:\n";
for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
dump_file << p.second << "\n";
dump_file << "*/\n";
}
program.createKernels(&kernels);
uint32_t part_idx = 0;
for (size_t i = 0; i < program_source.source.size(); i++) {
auto sources_bucket_to_compile = program_source.source[i];
const auto& hash_value = program_source.hash_values[i];
std::string cached_bin_name = get_cache_path() + std::to_string(hash_value) + ".cl_cache";
cl::Program::Binaries precompiled_kernels = {};
if (is_cache_enabled()) {
// Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
// If read is successful, then remove kernels from compilation bucket
auto bin = loadBinaryFromFile(cached_bin_name);
if (!bin.empty()) {
precompiled_kernels.push_back(bin);
}
// If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
// Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
// Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer
// compile time.
saveBinaryToFile(cached_bin_name, getProgramBinaries(program));
}
auto current_dump_file_name = dump_file_name + std::to_string(part_idx++) + ".cl";
std::ofstream dump_file;
if (dump_sources) {
dump_file.open(current_dump_file_name);
if (dump_file.good()) {
for (auto& s : sources_bucket_to_compile)
dump_file << s;
}
}
try {
cl::vector<cl::Kernel> kernels;
// Run compilation
if (precompiled_kernels.empty()) {
cl::Program program(_context.context(), sources_bucket_to_compile);
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation");
program.build(_context.device(), program_source.options.c_str());
}
if (dump_sources && dump_file.good()) {
dump_file << "\n/* Build Log:\n";
for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
dump_file << p.second << "\n";
dump_file << "*/\n";
}
program.createKernels(&kernels);
if (is_cache_enabled()) {
// If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
// Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
// Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer
// compile time.
saveBinaryToFile(cached_bin_name, getProgramBinaries(program));
} else {
cl::Program program(_context.context(), {_context.device()}, precompiled_kernels);
program.build(_context.device(), batch.options.c_str());
program.createKernels(&kernels);
}
{
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = batch.entry_point_to_id.find(entry_point);
const auto& k_type = kernel_type(k, _context.get_device_info().supports_usm);
if (k_id != batch.entry_point_to_id.end()) {
const auto& kmap = std::make_pair(k_id->second, k_type);
if (batch.one_time) {
_one_time_kernels.insert(kmap);
} else {
_kernels.insert(kmap);
}
} else {
cl::Program program(_context.context(), {_context.device()}, precompiled_kernels);
program.build(_context.device(), program_source.options.c_str());
program.createKernels(&kernels);
throw std::runtime_error("Could not find entry point");
}
for (auto& k : kernels) {
auto kernel_name = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
kmap.emplace(kernel_name, kernels_cache::kernel_type(k, _context.get_device_info().supports_usm));
}
} catch (const cl::BuildError& err) {
if (dump_sources && dump_file.good())
dump_file << "\n/* Build Log:\n";
for (auto& p : err.getBuildLog()) {
if (dump_sources && dump_file.good())
dump_file << p.second << "\n";
err_log += p.second + '\n';
}
if (dump_sources && dump_file.good())
dump_file << "*/\n";
}
}
} catch (const cl::BuildError& err) {
if (dump_sources && dump_file.good())
dump_file << "\n/* Build Log:\n";
if (!err_log.empty()) {
throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n");
for (auto& p : err.getBuildLog()) {
if (dump_sources && dump_file.good())
dump_file << p.second << "\n";
err_log += p.second + '\n';
}
return kmap;
} catch (const cl::Error& err) {
throw ocl_error(err);
if (dump_sources && dump_file.good())
dump_file << "*/\n";
}
if (!err_log.empty()) {
throw std::runtime_error("Program build failed. You may enable OCL source dump to see the error log.\n");
}
}
kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) {
build_all();
if (one_time_kernel) {
return _one_time_kernels.at(id);
} else {
return _kernels.at(id);
}
kernels_cache::kernel_type kernels_cache::get_kernel(kernel_id id, bool one_time_kernel) const {
if (_pending_compilation)
throw std::runtime_error("Kernel cache is not compiled, call build_all() first!");
const auto& kernels = one_time_kernel ? _one_time_kernels : _kernels;
auto res = kernels.find(id);
if (kernels.end() == res)
throw std::runtime_error("Kernel " + id + " not found in the kernel cache!");
return res->second;
}
void kernels_cache::build_all() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
if (!_pending_compilation)
return;
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
auto sorted_program_code = get_program_source(_kernels_code);
_one_time_kernels.clear();
for (auto& program : sorted_program_code) {
auto kernels = build_program(program.second);
for (auto& k : kernels) {
const auto& entry_point = k.first;
const auto& k_id = program.second.entry_point_to_id[entry_point];
if (program.second.one_time) {
_one_time_kernels[k_id] = k.second;
} else {
_kernels[k_id] = k.second;
}
}
std::vector<batch_program> batches;
{
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
get_program_source(_kernels_code, &batches);
_one_time_kernels.clear();
}
_kernels_code.clear();
_pending_compilation = false;
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
arena->execute([this, &batches] {
tbb::parallel_for(tbb::blocked_range<size_t>(0, batches.size()), [this, &batches](const tbb::blocked_range<size_t>& r) {
for (auto i = r.begin(); i != r.end(); ++i) {
build_batch(batches[i]);
}
});
});
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
std::vector<std::future<void>> builds;
for (size_t i = 0; i < batches.size(); ++i) {
builds.push_back(pool->enqueue([this, &batches, i] () {
build_batch(batches[i]);
}));
}
std::for_each(builds.begin(), builds.end(), [] (std::future<void>& f) { f.wait(); });
#else
// no parallel build
for (const auto& batch : batches) {
build_batch(batch);
}
#endif
{
std::lock_guard<std::mutex> lock(_context.get_cache_mutex());
_kernels_code.clear();
_pending_compilation = false;
}
}
void kernels_cache::reset() {
@ -452,6 +488,5 @@ void kernels_cache::reset() {
_kernels_code.clear();
_pending_compilation = false;
}
} // namespace gpu
} // namespace cldnn
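
The dispatch pattern that build_all() uses can be illustrated with a standard-library-only sketch; the actual code runs the same loop inside a tbb::task_arena or on the custom thread_pool added in kernels_cache.h (below), so std::async here is a simplification:

    #include <future>
    #include <vector>

    struct batch {};   // stands in for kernels_cache::batch_program
    void build_batch(const batch&) { /* compile one OCL program; body omitted */ }

    void build_all_batches(const std::vector<batch>& batches) {
        std::vector<std::future<void>> builds;
        builds.reserve(batches.size());
        for (const auto& b : batches)
            builds.push_back(std::async(std::launch::async, [&b]() { build_batch(b); }));
        for (auto& f : builds)
            f.wait();   // block until every batch has finished compiling
    }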


@ -13,6 +13,19 @@
#include <unordered_set>
#include <kernel_selector_common.h>
#define CLDNN_THREADING_SEQ 0
#define CLDNN_THREADING_TBB 1
#define CLDNN_THREADING_THREADPOOL 2
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
#include <tbb/task_arena.h>
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
#include <queue>
#include <future>
#include <functional>
#include <condition_variable>
#endif
namespace cl {
class Kernel;
class KernelIntel;
@ -26,14 +39,76 @@ namespace cldnn {
namespace gpu {
class gpu_toolkit;
#if (CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
class thread_pool {
public:
thread_pool(size_t num_threads) : _stop_pool(false) {
_workers.reserve(num_threads);
for (size_t i = 0; i < num_threads; ++i) {
_workers.emplace_back(std::thread(&thread_pool::worker_thread, this));
}
}
~thread_pool() {
{
std::lock_guard<std::mutex> lock(_q_m);
_stop_pool = true;
}
this->wait_all();
}
template <class F, class... Args>
std::future<typename std::result_of<F(Args...)>::type> enqueue(F&& f, Args&&... args) {
if (_stop_pool) {
throw std::runtime_error("Thread pool is stoped");
}
using return_type = typename std::result_of<F(Args...)>::type;
auto task = std::make_shared<std::packaged_task<return_type()>> (std::bind(std::forward<F>(f), std::forward<Args>(args)...));
std::future<return_type> result = task->get_future();
{
std::lock_guard<std::mutex> lock(_q_m);
_tasks.push([task]() {(*task)();});
}
_cv.notify_one();
return result;
}
void wait_all() {
_cv.notify_all();
for (auto& w : _workers) {
w.join();
}
}
private:
std::vector<std::thread> _workers;
std::queue<std::function<void()>> _tasks;
std::condition_variable _cv;
std::mutex _q_m;
bool _stop_pool;
void worker_thread() {
while (true) {
std::unique_lock<std::mutex> lock(this->_q_m);
_cv.wait(lock, [this]() { return (!this->_tasks.empty()) || (_stop_pool); });
if ((_stop_pool) && (this->_tasks.empty())) return;
auto task = std::move(_tasks.front());
this->_tasks.pop();
lock.unlock();
task();
}
}
};
#endif
class kernels_cache {
public:
using source_code = std::vector<std::string>;
struct program_code {
std::vector<source_code> source;
std::vector<size_t> hash_values;
struct batch_program {
int32_t bucket_id = 0;
int32_t batch_id = 0;
source_code source;
size_t hash_value;
uint32_t kernels_counter = 0;
std::string options;
bool dump_custom_program = false;
@ -69,7 +144,6 @@ public:
typedef std::string kernel_id;
typedef cl::KernelIntel kernel_type;
using sorted_code = std::map<std::string, program_code>;
using kernels_map = std::map<std::string, kernel_type>;
using kernels_code = std::unordered_set<kernel_code, hash_kernel_code>;
@ -77,13 +151,19 @@ private:
gpu_toolkit& _context;
kernels_code _kernels_code;
std::atomic<bool> _pending_compilation{false};
std::map<std::string, kernel_type> _kernels;
std::map<std::string, kernel_type> _one_time_kernels; // These kernels are intended to be executed only once (can
std::map<const std::string, const kernel_type> _kernels;
std::map<const std::string, const kernel_type> _one_time_kernels; // These kernels are intended to be executed only once (can
// be removed later from the cache).
uint32_t _prog_id;
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
std::unique_ptr<tbb::task_arena> arena;
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
std::unique_ptr<thread_pool> pool;
#endif
sorted_code get_program_source(const kernels_code& kernels_source_code) const;
kernels_map build_program(const program_code& pcode) const;
void get_program_source(const kernels_code& kernels_source_code, std::vector<batch_program>*) const;
void build_batch(const batch_program& batch);
std::string get_cache_path() const;
bool is_cache_enabled() const;
@ -94,7 +174,7 @@ public:
kernel_id set_kernel_source(const std::shared_ptr<kernel_selector::kernel_string>& kernel_string,
bool dump_custom_program,
bool one_time_kernel);
kernel_type get_kernel(kernel_id id, bool one_time_kernel);
kernel_type get_kernel(kernel_id id, bool one_time_kernel) const;
gpu_toolkit& get_context() { return _context; }
// forces compilation of all pending kernels/programs
void build_all();
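
A minimal usage sketch for the thread_pool helper above (assuming a CLDNN_THREADING_THREADPOOL build), mirroring how build_all() drives it:

    cldnn::gpu::thread_pool pool(4);   // 4 worker threads
    std::vector<std::future<void>> results;
    for (int i = 0; i < 8; ++i) {
        results.push_back(pool.enqueue([i]() {
            (void)i;   // placeholder for build_batch(batches[i])
        }));
    }
    for (auto& f : results)
        f.wait();   // all tasks finished; the pool joins its workers on destruction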


@ -3,37 +3,37 @@
<models>
<!--Models with FP32 precision-->
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="740214" vmpeak="805110" vmrss="129308" vmhwm="129308" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="922147" vmpeak="922147" vmrss="587522" vmhwm="587522" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2709506" vmpeak="2794703" vmrss="1342104" vmhwm="1342104" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="1007890" vmpeak="1007890" vmrss="138652" vmhwm="138652" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="1006439" vmpeak="1091636" vmrss="587241" vmhwm="587241" />
<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2709501" vmpeak="2794698" vmrss="1291404" vmhwm="1291404" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="691589" vmpeak="922864" vmrss="31054" vmhwm="31054" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="626194" vmpeak="626194" vmrss="290695" vmhwm="290695" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="626194" vmpeak="626194" vmrss="403228" vmhwm="403228" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="958240" vmpeak="1043437" vmrss="31366" vmhwm="31366" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="708734" vmpeak="793930" vmrss="287877" vmhwm="287877" />
<model path="public/mtcnn/mtcnn-r/FP32/mtcnn-r.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="854417" vmpeak="939614" vmrss="402339" vmhwm="402339" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="1046988" vmpeak="1179042" vmrss="307990" vmhwm="439457" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="1267775" vmpeak="1279647" vmrss="932672" vmhwm="944626" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2969241" vmpeak="2969241" vmrss="1506492" vmhwm="1506492" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="1321819" vmpeak="1321819" vmrss="374207" vmhwm="439748" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="1356565" vmpeak="1441762" vmrss="941418" vmhwm="947060" />
<model path="public/ssd300/FP32/ssd300.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2605324" vmpeak="26900521" vmrss="1549958" vmhwm="1549958" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="create_exenetwork" device="CPU" vmsize="2133814" vmpeak="2836412" vmrss="1438049" vmhwm="2140533" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="create_exenetwork" device="GPU" vmsize="2801422" vmpeak="3915366" vmrss="2465065" vmhwm="3578811" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="infer_request_inference" device="CPU" vmsize="2401380" vmpeak="2836412" vmrss="1469832" vmhwm="2140377" />
<model path="public/vgg16/FP32/vgg16.xml" precision="FP32" test="infer_request_inference" device="GPU" vmsize="2892432" vmpeak="3939166" vmrss="2472017" vmhwm="3602924" />
<!--Models with FP16 precision-->
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="1057487" vmpeak="1085224" vmrss="109694" vmhwm="137295" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="760942" vmpeak="760942" vmrss="418298" vmhwm="418298" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2667537" vmpeak="2752734" vmrss="1304919" vmhwm="1304919" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="1058844" vmpeak="1085224" vmrss="123016" vmhwm="136682" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="845348" vmpeak="930545" vmrss="417445" vmhwm="417445" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mobilenet-ssd/FP16/mobilenet-ssd.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2347389" vmpeak="2432586" vmrss="1290504" vmhwm="1290504" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="955427" vmpeak="955806" vmrss="27700" vmhwm="27700" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="680862" vmpeak="680862" vmrss="331858" vmhwm="331858" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2123113" vmpeak="2208310" vmrss="453814" vmhwm="453814" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="955827" vmpeak="955827" vmrss="27222" vmhwm="27222" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="766053" vmpeak="851250" vmrss="331458" vmhwm="331458" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/mtcnn/mtcnn-r/FP16/mtcnn-r.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="1760990" vmpeak="1760990" vmrss="454173" vmhwm="454173" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="1372961" vmpeak="1505639" vmrss="369969" vmhwm="501649" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="866543" vmpeak="866543" vmrss="523967" vmhwm="523967" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2746588" vmpeak="2831784" vmrss="1296328" vmhwm="1296328" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="1381265" vmpeak="1505472" vmrss="437039" vmhwm="500630" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="951584" vmpeak="1036781" vmrss="528060" vmhwm="528060" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/ssd300/FP16/ssd300.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2380580" vmpeak="2465777" vmrss="1326369" vmhwm="1326369" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="CPU" vmsize="2748220" vmpeak="3450818" vmrss="1783704" vmhwm="2486161" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="1397463" vmpeak="1994402" vmrss="1049625" vmhwm="1629414" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="create_exenetwork" device="GPU" vmsize="2181312" vmpeak="2582752" vmrss="1060712" vmhwm="1629414" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="CPU" vmsize="2749458" vmpeak="3450818" vmrss="1816765" vmhwm="2486525" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="1482655" vmpeak="1998812" vmrss="1049692" vmhwm="1630782" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
<model path="public/vgg16/FP16/vgg16.xml" precision="FP16" test="infer_request_inference" device="GPU" vmsize="2910814" vmpeak="3347489" vmrss="1371380" vmhwm="1717102" /> # values from {"commit_id": "af63cb78ee5cbd66bac0d0980db61cb11b5d9995", "commit_date": "2021-03-03 15:44"} and *= 1.3
</models>
</attributes>