[IE CLDNN] Added ITT counters in the plugin (#3719)

Elizaveta Gerashchenko 2021-01-14 09:56:18 +03:00 committed by GitHub
parent 036259481d
commit d2303262a2
8 changed files with 88 additions and 25 deletions

cldnn_config.cpp

@ -11,6 +11,7 @@
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
#include "ie_api.h"
#include "file_utils.h"
#include "cldnn_itt.h"
#ifdef _WIN32
# include <direct.h>
@ -40,6 +41,7 @@ static void createDirectory(std::string _path) {
}
void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Config::UpdateFromMap");
for (auto& kvp : configMap) {
std::string key = kvp.first;
std::string val = kvp.second;
@ -228,6 +230,7 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
}
void Config::adjustKeyMapValues() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Config::adjustKeyMapValues");
if (useProfiling)
key_config_map[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES;
else

cldnn_custom_layer.cpp

@ -15,6 +15,7 @@
#endif
#include "simple_math.h"
#include "cldnn_itt.h"
using namespace InferenceEngine;
using namespace XMLParseUtils;
@ -224,6 +225,7 @@ cldnn::format CLDNNCustomLayer::FormatFromString(const std::string & str) {
}
void CLDNNCustomLayer::LoadFromFile(const std::string configFile, CLDNNCustomLayerMap& customLayers, bool can_be_missed) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNCustomLayer::LoadFromFile");
pugi::xml_document xmlDoc;
pugi::xml_parse_result res = xmlDoc.load_file(configFile.c_str());
if (res.status != pugi::status_ok) {

cldnn_engine.cpp

@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@ -64,6 +64,7 @@
#include "cldnn_engine.h"
#include "cldnn_executable_network.h"
#include "cldnn_custom_layer.h"
#include "cldnn_itt.h"
#ifdef __linux__
# include <dlfcn.h>
@ -122,9 +123,11 @@ static bool disableReduceDecomposition(const std::shared_ptr<const ngraph::Node>
InferenceEngine::CNNNetwork clDNNEngine::CloneAndTransformNetwork(const InferenceEngine::CNNNetwork& network,
const CLDNNPlugin::Config& config) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::CloneAndTransformNetwork");
CNNNetwork clonedNetwork = InferenceEngine::details::cloneNetwork(network);
if (clonedNetwork.getFunction()) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::TransformNetwork");
auto nGraphFunc = clonedNetwork.getFunction();
// Disable shape inference (WA for generic operations)
ngraph::op::GenericIE::DisableReshape noReshape(nGraphFunc);
@ -268,6 +271,7 @@ InferenceEngine::CNNNetwork clDNNEngine::CloneAndTransformNetwork(const Inferenc
bool enableInt8 = config.enableInt8 && ngraph::pass::low_precision::LowPrecisionTransformer::isFunctionQuantized(nGraphFunc);
if (enableInt8) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::TransformNetwork::LPT");
using namespace ngraph::pass::low_precision;
ngraph::pass::Manager conversion_manager;
// [WA part1] Convert quantized FP16 model to FP32 to avoid possible overflow and mixed precision errors
@ -284,6 +288,7 @@ InferenceEngine::CNNNetwork clDNNEngine::CloneAndTransformNetwork(const Inferenc
}
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::TransformNetwork::RunPasses");
ngraph::pass::Manager manager;
// This ConstantFolding pass is added to fold reshapes added for constant inputs on NMS internal operation which prevents upper-bound calculation
// TODO: check why we have these reshapes
@ -349,6 +354,7 @@ auto check_inputs = [](InferenceEngine::InputsDataMap _networkInputs) {
};
void clDNNEngine::UpdateConfig(CLDNNPlugin::Config& conf, const InferenceEngine::CNNNetwork &network, const std::map<std::string, std::string> &params) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::UpdateConfig");
auto device_info = GetDeviceInfo(params);
conf.enableInt8 = device_info.supports_imad || device_info.supports_immad;
conf.UpdateFromMap(params);
@ -359,6 +365,7 @@ void clDNNEngine::UpdateConfig(CLDNNPlugin::Config& conf, const InferenceEngine:
ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network,
const std::map<std::string, std::string> &config) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::LoadExeNetworkImpl");
// verification of supported input
InferenceEngine::InputsDataMap _networkInputs = network.getInputsInfo();
check_inputs(_networkInputs);
@ -389,6 +396,7 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEn
};
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::LoadExeNetworkImpl::CreateContext");
std::lock_guard<std::mutex> lock(engine_mutex);
if (!canReuseDefaultContext()) {
m_defaultContext.reset(new CLDNNRemoteCLContext(shared_from_this(), ParamMap(), conf));
@ -398,7 +406,10 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEn
context = m_defaultContext;
auto transformedNetwork = CloneAndTransformNetwork(network, conf);
-    return std::make_shared<CLDNNExecNetwork>(transformedNetwork, context, conf);
+    {
+        OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::LoadExeNetworkImpl::CreateExeNetwork");
+        return std::make_shared<CLDNNExecNetwork>(transformedNetwork, context, conf);
+    }
}
ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network,
@ -451,6 +462,7 @@ void clDNNEngine::SetConfig(const std::map<std::string, std::string> &config) {
QueryNetworkResult clDNNEngine::QueryNetwork(const CNNNetwork& network,
const std::map<std::string, std::string>& config) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::QueryNetwork");
QueryNetworkResult res;
CLDNNPlugin::Config conf = _impl->m_config;
UpdateConfig(conf, network, config);
@ -664,6 +676,7 @@ QueryNetworkResult clDNNEngine::QueryNetwork(const CNNNetwork& network,
}
Parameter clDNNEngine::GetConfig(const std::string& name, const std::map<std::string, Parameter>& /*options*/) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::GetConfig");
Parameter result;
auto option = _impl->m_config.key_config_map.find(name);
if (option != _impl->m_config.key_config_map.end()) {
@ -692,6 +705,7 @@ auto StringRightTrim = [](std::string string, std::string substring, bool case_s
};
Parameter clDNNEngine::GetMetric(const std::string& name, const std::map<std::string, Parameter>& options) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "clDNNEngine::GetMetric");
auto device_id = GetConfig(CONFIG_KEY(DEVICE_ID), {});
if (options.find(CONFIG_KEY(DEVICE_ID)) != options.end())
device_id = options.at(CONFIG_KEY(DEVICE_ID)).as<std::string>();

cldnn_executable_network.cpp

@ -13,6 +13,7 @@
#include <cmath>
#include <algorithm>
#include "cldnn_graph.h"
#include "cldnn_itt.h"
#include <description_buffer.hpp>
#include <cldnn/cldnn_config.hpp>
@ -63,6 +64,7 @@ CLDNNExecNetwork::CLDNNExecNetwork(InferenceEngine::CNNNetwork &network, RemoteC
InferRequestInternal::Ptr CLDNNExecNetwork::CreateInferRequestImpl(InputsDataMap networkInputs,
OutputsDataMap networkOutputs) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNExecNetwork::CreateInferRequestImpl");
if (m_graphs.empty()) {
THROW_IE_EXCEPTION << NETWORK_NOT_LOADED_str;
}
@ -90,6 +92,7 @@ InferRequestInternal::Ptr CLDNNExecNetwork::CreateInferRequestImpl(InputsDataMap
}
IInferRequest::Ptr CLDNNExecNetwork::CreateInferRequest() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNExecNetwork::CreateInferRequest");
return CreateAsyncInferRequestFromSync<CLDNNAsyncInferRequest>();
}
@ -110,6 +113,7 @@ InferenceEngine::Parameter CLDNNExecNetwork::GetConfig(const std::string &name)
}
InferenceEngine::Parameter CLDNNExecNetwork::GetMetric(const std::string &name) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNExecNetwork::GetMetric");
if (name == METRIC_KEY(NETWORK_NAME)) {
IE_ASSERT(!m_graphs.empty());
IE_SET_METRIC_RETURN(NETWORK_NAME, m_graphs[0]->getName());

cldnn_graph.cpp

@ -27,6 +27,7 @@
#include <ie_ngraph_utils.hpp>
#include "generic_ie.hpp"
#include <ngraph/variant.hpp>
#include "cldnn_itt.h"
using namespace InferenceEngine;
using namespace InferenceEngine::details;
@ -52,6 +53,7 @@ CLDNNGraph::CLDNNGraph(std::shared_ptr<CLDNNGraph> graph, uint16_t stream_id)
}
void CLDNNGraph::UpdateLayersMaps() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::UpdateLayersMaps");
primitiveIDs = m_program->primitiveIDs;
primitivesToIRLayersMap = m_program->primitivesToIRLayersMap;
IRToNgraphLayersMap = m_program->IRToNgraphLayersMap;
@ -62,6 +64,7 @@ void CLDNNGraph::UpdateLayersMaps() {
}
void CLDNNGraph::Build() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::Build");
UpdateLayersMaps();
if (GetMaxDynamicBatchSize() > 1) {
@ -81,6 +84,7 @@ void CLDNNGraph::Build() {
}
std::shared_ptr<cldnn::network> CLDNNGraph::BuildNetwork(std::shared_ptr<cldnn::program> program) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::BuildNetwork");
auto network = std::make_shared<cldnn::network>(*program, m_stream_id);
if (!m_config.graph_dumps_dir.empty() && m_stream_id == 0) {
@ -101,6 +105,7 @@ std::shared_ptr<cldnn::network> CLDNNGraph::BuildNetwork(std::shared_ptr<cldnn::
InferenceEngine::CNNNetwork CLDNNGraph::GetExecGraphInfoByPrimitivesInfo(std::vector<cldnn::primitive_info>& primitives_info,
bool filter_const_primitives) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::GetExecGraphInfoByPrimitivesInfo");
if (m_config.useProfiling) {
try {
// Update may throw an exception for step-by-step runtime graph dump,
@ -474,6 +479,7 @@ InferenceEngine::CNNNetwork CLDNNGraph::GetExecGraphInfo() {
void CLDNNGraph::UpdatePerfStatistics() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::UpdatePerfStatistics");
if (GetNetworksCount() == 0) {
return;
}
@ -545,6 +551,7 @@ bool CLDNNGraph::IsLoaded() const {
}
void CLDNNGraph::UpdateImplementationsMap() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::UpdateImplementationsMap");
if (m_config.useProfiling) {
auto extractImplementationFromInfo = [](const std::string& info) -> std::string {
std::string def_implementation = "undef";
@ -587,6 +594,7 @@ void CLDNNGraph::UpdateImplementationsMap() {
}
void CLDNNGraph::GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &result) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNGraph::GetPerformanceCounts");
bool combinePrimByIRLayers = false;
unsigned i = 0;
auto allIds = GetNetwork()->get_all_primitive_org_ids();

cldnn_infer_request.cpp

@ -23,6 +23,7 @@ const char cannot_set_compound[] = "cannot set compound blob: supported only for
const char wrong_nv12_blob[] = "NV12 input blob is expected for input with NV12 color format";
Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* mem_ptr) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::createInputBlob");
const Precision p = desc.getPrecision();
switch (p) {
@ -77,6 +78,7 @@ Blob::Ptr CLDNNInferRequest::createInputBlob(const TensorDesc& desc, uint8_t* me
}
Blob::Ptr CLDNNInferRequest::createOutputBlob(const TensorDesc& desc, uint8_t* mem_ptr) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::createOutputBlob");
const Precision p = desc.getPrecision();
switch (p) {
@ -106,6 +108,7 @@ Blob::Ptr CLDNNInferRequest::createOutputBlob(const TensorDesc& desc, uint8_t* m
}
void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory& inputMem) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::input_attach");
auto impl = getContextImpl(m_graph->GetContext());
impl->acquire_lock();
@ -120,6 +123,7 @@ void CLDNNInferRequest::input_attach(cldnn::primitive_id name, cldnn::memory& in
}
void CLDNNInferRequest::input_alloc(cldnn::primitive_id name, const cldnn::layout& layout) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::input_alloc");
cldnn::memory input_mem = cldnn::memory::allocate(*(m_graph->GetEngine()), layout);
input_attach(name, input_mem);
}
@ -127,6 +131,7 @@ void CLDNNInferRequest::input_alloc(cldnn::primitive_id name, const cldnn::layou
void CLDNNInferRequest::copyOutputData(const cldnn::memory& outputMemory,
Blob::Ptr bptr,
buf_info* bi) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyOutputData");
size_t n = (bi == nullptr) ? bptr->size() : bi->buf_size;
size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
@ -270,6 +275,7 @@ void CLDNNInferRequest::copyInputData(std::shared_ptr<cldnn::network> network,
const cldnn::primitive_id &inputName,
const cldnn::layout& inputLayout,
const Blob &inputBlob, buf_info* bi) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::copyInputData");
size_t n = (bi == nullptr) ? inputBlob.size() : bi->buf_size;
size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
@ -386,6 +392,7 @@ void checkOutputBlob(const Blob::Ptr &blob,
}
void CLDNNInferRequest::checkBlobs() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::checkBlobs");
for (auto const &input : _inputs) {
InputInfo::Ptr foundInput = nullptr;
auto foundInputPair = std::find_if(std::begin(_networkInputs), std::end(_networkInputs),
@ -415,7 +422,7 @@ void CLDNNInferRequest::checkBlobs() {
}
void CLDNNInferRequest::GetBlob(const char *name, Blob::Ptr &data) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "GetBlob");
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::GetBlob");
InputInfo::Ptr foundInput;
DataPtr foundOutput;
bool is_input = findInputAndOutputBlobByName(name, foundInput, foundOutput);
@ -436,7 +443,7 @@ void CLDNNInferRequest::GetBlob(const char *name, Blob::Ptr &data) {
}
void CLDNNInferRequest::SetBlob(const char *name, const Blob::Ptr &data) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "SetBlob");
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::SetBlob");
// perform all common checks first
if (name == nullptr) {
@ -562,6 +569,7 @@ void CLDNNInferRequest::SetBlob(const char *name, const Blob::Ptr &data) {
}
void CLDNNInferRequest::AllocateInputs() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::AllocateInputs");
auto inputLayouts = m_graph->GetInputLayouts();
// allocate inputs
for (auto& ni : _networkInputs) {
@ -611,6 +619,7 @@ void CLDNNInferRequest::AllocateInputs() {
}
void CLDNNInferRequest::AllocateInputsDyn() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::AllocateInputsDyn");
// allocate inputs
for (auto &input : m_graph->GetInputLayouts()) {
InputInfo::Ptr ni = _networkInputs.at(input.first);
@ -636,6 +645,7 @@ void CLDNNInferRequest::AllocateInputsDyn() {
}
void CLDNNInferRequest::AllocateOutputs() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::AllocateOutputs");
// allocate outputs
bool can_reuse_internal_mem = !m_useStreams;
for (auto& no : _networkOutputs) {
@ -661,6 +671,7 @@ void CLDNNInferRequest::AllocateOutputs() {
}
void CLDNNInferRequest::AllocateOutputsDyn() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::AllocateOutputsDyn");
// allocate outputs
for (auto& no : _networkOutputs) {
DataPtr oi = no.second;
@ -680,6 +691,7 @@ void CLDNNInferRequest::AllocateOutputsDyn() {
}
void CLDNNInferRequest::SetGraph(std::shared_ptr<CLDNNPlugin::CLDNNGraph> graph) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::SetGraph");
m_graph = graph;
if (m_graph == nullptr) {
@ -697,6 +709,7 @@ void CLDNNInferRequest::SetGraph(std::shared_ptr<CLDNNPlugin::CLDNNGraph> graph)
}
void CLDNNInferRequest::SetBatch(int new_batch) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::SetBatch");
if (m_graph->GetMaxDynamicBatchSize() < 0)
THROW_IE_EXCEPTION << "Dynamic batch is not enabled.";
@ -774,6 +787,7 @@ CLDNNInferRequest::CLDNNInferRequest(InputsDataMap networkInputs, OutputsDataMap
}
void CLDNNInferRequest::execAndParse() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::execAndParse");
auto networkOutputs = m_graph->GetNetwork()->execute();
// Collect outputs as requested by the model
@ -804,6 +818,7 @@ void CLDNNInferRequest::execAndParse() {
}
void CLDNNInferRequest::execAndParseDyn() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::execAndParseDyn");
std::vector<std::map<cldnn::primitive_id, cldnn::network_output>> networkOutputs(m_graph->GetNetworksCount());
// set up execution and put all graphs into driver queue
@ -832,7 +847,7 @@ void CLDNNInferRequest::execAndParseDyn() {
}
void CLDNNInferRequest::InferImpl() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNN_INFER");
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::InferImpl");
int streamID = 0;
if (nullptr != streamExecutor) {
streamID = streamExecutor->GetStreamId();
@ -871,6 +886,7 @@ void CLDNNInferRequest::InferImpl() {
void CLDNNInferRequest::GetPerformanceCounts(
std::map<std::string, InferenceEngineProfileInfo> &perfMap) const {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::GetPerformanceCounts");
if (!m_useProfiling) {
THROW_IE_EXCEPTION << "Performance counters were not enabled";
} else {
@ -882,6 +898,7 @@ namespace {
template <typename T>
void copyToFloat(float* dst, const InferenceEngine::Blob* src) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "copyToFloat");
if (!dst) {
return;
}
@ -901,6 +918,7 @@ void copyToFloat(float* dst, const InferenceEngine::Blob* src) {
} // namespace
void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const Blob &inputBlob) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::PrepareInput");
// Get input layout
if (m_graph->GetInputLayouts().find(inputName) == m_graph->GetInputLayouts().end()) {
THROW_IE_EXCEPTION << "Input name mismatch.";
@ -959,6 +977,7 @@ void CLDNNInferRequest::PrepareInput(const cldnn::primitive_id &inputName, const
}
void CLDNNInferRequest::PrepareInputDyn(const cldnn::primitive_id &inputName, const Blob &inputBlob) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNInferRequest::PrepareInputDyn");
// now try to get execution results
for (unsigned nb = 0; nb < m_graph->GetNetworksCount(); nb++) {
unsigned int mask = 1 << nb;

cldnn_program.cpp

@ -5,6 +5,7 @@
#include "cldnn_program.h"
#include "ngraph/ops.hpp"
#include "ngraph_ops/nms_ie_internal.hpp"
#include "cldnn_itt.h"
using namespace InferenceEngine;
using namespace InferenceEngine::details;
@ -175,6 +176,7 @@ void Program::CleanupBuild() {
std::shared_ptr<cldnn::program> Program::BuildProgram(std::vector<std::shared_ptr<ngraph::Node>> ops,
InferenceEngine::InputsDataMap networkInputs,
InferenceEngine::OutputsDataMap networkOutputs) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::BuildProgram");
cldnn::build_options options;
if (!m_config.graph_dumps_dir.empty()) {
options.set_option(cldnn::build_option::graph_dumps_dir(m_config.graph_dumps_dir));
@ -186,14 +188,17 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(std::vector<std::shared_pt
for (auto op : ops) {
CreateSingleLayerPrimitive(*m_topology, op);
}
-    auto program = std::make_shared<cldnn::program>(*m_engine, *m_topology, options);
-    CleanupBuild();
-    return program;
+    {
+        OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::CreateProgram");
+        auto program = std::make_shared<cldnn::program>(*m_engine, *m_topology, options);
+        CleanupBuild();
+        return program;
+    }
}
bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const std::shared_ptr<ngraph::Node>& op) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::IsOpSupported");
cldnn::topology topology;
try {
// Query mode disables checks that input primitives are created,
@ -220,6 +225,7 @@ bool Program::IsOpSupported(const InferenceEngine::CNNNetwork& network, const st
}
void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::shared_ptr<ngraph::Node>& op) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "Program::CreateSingleLayerPrimitive");
InitProfileInfo(op->get_friendly_name(), op->get_type_name());
bool is_created = false;

cldnn_remote_context.cpp

@ -4,6 +4,7 @@
#include <memory>
#include "cldnn_remote_context.h"
#include "cldnn_itt.h"
using namespace InferenceEngine;
using namespace InferenceEngine::gpu;
@ -80,6 +81,7 @@ bool CLDNNRemoteBlobImpl::is_locked() const noexcept {
}
void CLDNNRemoteBlobImpl::allocate_if_needed() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNRemoteBlobImpl::allocate_if_needed");
auto _impl = getContextImpl(m_context.lock());
_impl->acquire_lock();
@ -116,6 +118,7 @@ void CLDNNRemoteBlobImpl::allocate_if_needed() {
}
void CLDNNRemoteBlobImpl::allocate() noexcept {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNRemoteBlobImpl::allocate");
assert(m_memObject == nullptr);
std::shared_ptr<const cldnn::engine> eng = getContextImpl(m_context.lock())->GetEngine();
@ -224,6 +227,7 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
m_type(ContextType::OCL),
m_config(config),
m_va_display(nullptr) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNExecutionContextImpl");
lock.clear(std::memory_order_relaxed);
gpu_handle_param _context_id = nullptr;
gpu_handle_param _va_device = nullptr;
@ -248,22 +252,25 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
auto iter = device_map.find(m_config.device_id);
auto& dev = iter != device_map.end() ? iter->second : device_map.begin()->second;
-    m_engine = std::make_shared<cldnn::engine>(dev,
-        cldnn::engine_configuration((m_config.useProfiling ||
-            (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_tune_and_cache) ||
-            (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache)),
-        false,
-        m_config.dumpCustomKernels,
-        std::string(),
-        std::string(),
-        true,
-        std::string(),
-        m_config.sources_dumps_dir,
-        m_config.queuePriority,
-        m_config.queueThrottle,
-        m_config.memory_pool_on,
-        m_config.throughput_streams,
-        m_config.kernels_cache_dir));
+    {
+        OV_ITT_SCOPED_TASK(itt::domains::CLDNNPlugin, "CLDNNExecutionContextImpl::Create");
+        m_engine = std::make_shared<cldnn::engine>(dev,
+            cldnn::engine_configuration((m_config.useProfiling ||
+                (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_tune_and_cache) ||
+                (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache)),
+            false,
+            m_config.dumpCustomKernels,
+            std::string(),
+            std::string(),
+            true,
+            std::string(),
+            m_config.sources_dumps_dir,
+            m_config.queuePriority,
+            m_config.queueThrottle,
+            m_config.memory_pool_on,
+            m_config.throughput_streams,
+            m_config.kernels_cache_dir));
+    }
}
ParamMap CLDNNExecutionContextImpl::getParams() const {