[IE][VPU]: CMX limit compile option (#1268)

In some networks, mvTensor would request a large CMX-DMA transfer (<512K). That starves DMA for other timing-critical tasks such as SIPP. Limit the CMX-DMA request size via an option in myriad_compile: * Add compile option TILING_CMX_LIMIT_KB * Declare compile option TILING_CMX_LIMIT_KB in IE tools (compile_tool and vpu_compile) * Add tests for compile option TILING_CMX_LIMIT_KB. Small fix for the naming of behavior tests.
2020-07-16 19:54:53 +03:00 · 2020-07-16 19:54:53 +03:00 · 73ee68afb2
commit 73ee68afb2
parent d9927a9f35
21 changed files with 85 additions and 26 deletions
--- a/inference-engine/src/vpu/common/include/vpu/parsed_config_base.hpp
+++ b/inference-engine/src/vpu/common/include/vpu/parsed_config_base.hpp
@ -107,14 +107,8 @@ protected:
    }

    static int parseInt(const std::string& src) {
-        return std::stoi(src);
-    }
-
-    static int parsePositiveInt(const std::string& src) {
        const auto val = std::stoi(src);
-        if (val < 0) {
-            throw std::invalid_argument("Zero/negative value");
-        }
+        // No range check here (the removed parsePositiveInt rejected negatives); preprocessCompileOption in parsed_config.cpp validates the accepted range.
        return val;
    }

--- a/inference-engine/src/vpu/graph_transformer/include/vpu/compile_env.hpp
+++ b/inference-engine/src/vpu/graph_transformer/include/vpu/compile_env.hpp
@ -21,6 +21,7 @@ struct DefaultAllocation {
    static int numStreams(const Platform& platform, const CompilationConfig& configuration);
    static int numSlices(const Platform& platform, int numStreams);
    static int numShaves(const Platform& platform, int numStreams, int numSlices);
+    static int tilingCMXLimit(int numSlices);
 };

 struct CompileEnv final {
--- a/inference-engine/src/vpu/graph_transformer/include/vpu/graph_transformer.hpp
+++ b/inference-engine/src/vpu/graph_transformer/include/vpu/graph_transformer.hpp
@ -43,6 +43,7 @@ struct CompilationConfig final {
    int numSHAVEs = -1;
    int numCMXSlices = -1;
    int numExecutors = -1;
+    int tilingCMXLimitKB = -1;

    bool hwOptimization = true;
    bool hwExtraSplit = false;
--- a/inference-engine/src/vpu/graph_transformer/include/vpu/middleend/hw/utility.hpp
+++ b/inference-engine/src/vpu/graph_transformer/include/vpu/middleend/hw/utility.hpp
@ -136,6 +136,5 @@ void printTo(DotLabel& lbl, const HwPaddingInfo& hwPad);
 //

 int calculateHwBufferSize(const DimValues& dims, const DimsOrder& order = DimsOrder());
-int tilingCMXLimit(int numSlices);

 }  // namespace vpu
--- a/inference-engine/src/vpu/graph_transformer/include/vpu/model/model.hpp
+++ b/inference-engine/src/vpu/graph_transformer/include/vpu/model/model.hpp
@ -32,6 +32,7 @@ struct Resources final {
    int numCMXSlices = 0;
    int numSHAVEs = 0;
    int numExecutors = 0;
+    int tilingCMXLimit = 0;
 };

 void printTo(std::ostream& os, const Resources& res);
--- a/inference-engine/src/vpu/graph_transformer/include/vpu/private_plugin_config.hpp
+++ b/inference-engine/src/vpu/graph_transformer/include/vpu/private_plugin_config.hpp
@ -17,6 +17,7 @@ namespace VPUConfigParams {

 DECLARE_VPU_CONFIG_KEY(NUMBER_OF_SHAVES);
 DECLARE_VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES);
+DECLARE_VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB);

 DECLARE_VPU_CONFIG_KEY(TENSOR_STRIDES);

--- a/inference-engine/src/vpu/graph_transformer/src/graph_transformer.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/graph_transformer.cpp
@ -98,6 +98,12 @@ void CompileEnv::init(Platform platform, const CompilationConfig& config, const
        R"(Value of configuration option ("{}") must be in the range [{}, {}], actual is "{}")",
        VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES), 1, DeviceResources::numSlices(platform), numSlices);

+    VPU_THROW_UNLESS(config.tilingCMXLimitKB >= -1, R"(Value of configuration option ("{}") must be greater than or equal to {}, actual is "{}")",
+        VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), -1, config.tilingCMXLimitKB);
+    const int defaultCmxLimit = DefaultAllocation::tilingCMXLimit(numSlices);
+    // -1 keeps the default; otherwise scale KB -> bytes in 64-bit (a large value must not overflow int) and never exceed the default limit.
+    const auto tilingCMXLimit = config.tilingCMXLimitKB != -1 ? static_cast<int>(std::min<long long>(config.tilingCMXLimitKB * 1024LL, defaultCmxLimit)) : defaultCmxLimit;
+
    const auto numShaves = config.numSHAVEs != -1 ? config.numSHAVEs : DefaultAllocation::numShaves(platform, numExecutors, numSlices);
    VPU_THROW_UNLESS(numShaves >= 1 && numShaves <= DeviceResources::numShaves(platform),
        R"(Value of configuration option ("{}") must be in the range [{}, {}], actual is "{}")",
@ -114,6 +120,7 @@ void CompileEnv::init(Platform platform, const CompilationConfig& config, const
    g_compileEnv->resources.numSHAVEs = numShaves;
    g_compileEnv->resources.numCMXSlices = numSlices;
    g_compileEnv->resources.numExecutors = numExecutors;
+    g_compileEnv->resources.tilingCMXLimit = tilingCMXLimit;
    g_compileEnv->initialized = true;
 }

@ -299,4 +306,8 @@ int DefaultAllocation::numShaves(const Platform& platform, int numStreams, int n
    return numShavesForAllocation / numStreams;
 }

+int DefaultAllocation::tilingCMXLimit(int numSlices) {  // default tiling CMX limit, derived only from the number of CMX slices
+    return (numSlices / 2) * CMX_SLICE_SIZE + CMX_SLICE_SIZE / 2;  // (numSlices / 2, integer division) whole slices plus half a slice; compared against tilingCMXLimitKB * 1024 in CompileEnv::init
+}
+
 }  // namespace vpu
--- a/inference-engine/src/vpu/graph_transformer/src/middleend/hw/conv_tiling/hw_convolution_tiler.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/middleend/hw/conv_tiling/hw_convolution_tiler.cpp
@ -640,8 +640,7 @@ std::vector<TilingOption> HWConvolutionTilingSearcher::selectBetterTiling() cons

    const auto& splitOver = dirTiling.splitOverTensorDims();
    const auto direction = dirTiling.getDirection();
-
-    const auto cmxLimit = tilingCMXLimit(env.resources.numCMXSlices);
+    const auto cmxLimit = env.resources.tilingCMXLimit;

    // split over Input tensor for the Channel dimension always
    for (int numChannelTiles = 1; numChannelTiles <= maxNumChannelTiles; numChannelTiles++) {
--- a/inference-engine/src/vpu/graph_transformer/src/middleend/hw/pooling_tiling/hw_pooling_tiler.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/middleend/hw/pooling_tiling/hw_pooling_tiler.cpp
@ -315,8 +315,7 @@ std::vector<TilingOption> HWPoolingTilingSearcher::selectBetterTiling() const {

    const auto& splitOver = dirTiling.splitOverTensorDims();
    const auto direction = dirTiling.getDirection();
-
-    const auto cmxLimit = tilingCMXLimit(env.resources.numCMXSlices);
+    const auto cmxLimit = env.resources.tilingCMXLimit;

    for (int numBatchTiles = 1; numBatchTiles <= maxNumBatchTiles; numBatchTiles++) {
        //
--- a/inference-engine/src/vpu/graph_transformer/src/middleend/hw/utility.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/middleend/hw/utility.cpp
@ -88,8 +88,4 @@ int calculateHwBufferSize(const DimValues& dims, const DimsOrder& order) {
    return calcTotalByteSize(desc, strides);
 }

-int tilingCMXLimit(int numSlices) {
-    return (numSlices / 2) * CMX_SLICE_SIZE + CMX_SLICE_SIZE / 2;
-}
-
 }  // namespace vpu
--- a/inference-engine/src/vpu/graph_transformer/src/middleend/passes/hw_fc_tiling.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/middleend/passes/hw_fc_tiling.cpp
@ -292,7 +292,7 @@ public:
        VPU_PROFILE(hwFullyConnectedTiling);

        const auto& env = CompileEnv::get();
-        const auto cmxLimit = tilingCMXLimit(env.resources.numCMXSlices);
+        const auto cmxLimit = env.resources.tilingCMXLimit;

        for (const auto& origStage : model->getStages()) {
            if (origStage->type() != StageType::StubFullyConnected) {
--- a/inference-engine/src/vpu/graph_transformer/src/middleend/passes/split_hw_conv_and_pool.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/middleend/passes/split_hw_conv_and_pool.cpp
@ -31,7 +31,7 @@ void PassImpl::run(const Model& model) {
    VPU_PROFILE(splitHwConvAndPool);

    const auto& env = CompileEnv::get();
-    const auto cmxLimit = tilingCMXLimit(env.resources.numCMXSlices);
+    const auto cmxLimit = env.resources.tilingCMXLimit;

    for (const auto& convStage : model->getStages()) {
        if (convStage == nullptr) {
--- a/inference-engine/src/vpu/graph_transformer/src/model/model.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/model/model.cpp
@ -28,6 +28,7 @@ namespace vpu {
 void printTo(std::ostream& os, const Resources& res) {
    os << "[" << std::endl;

+    os << "tilingCMXLimit=" << res.tilingCMXLimit << std::endl;
    os << "numCMXSlices=" << res.numCMXSlices << std::endl;
    os << "numSHAVEs=" << res.numSHAVEs << std::endl;

@ -36,6 +37,7 @@ void printTo(std::ostream& os, const Resources& res) {

 void printTo(DotLabel& lbl, const Resources& res) {
    DotLabel subLbl(lbl);
+    subLbl.appendPair("tilingCMXLimit", res.tilingCMXLimit);  // effective limit stored by CompileEnv::init (option value * 1024, capped by the default)
    subLbl.appendPair("numCMXSlices", res.numCMXSlices);
    subLbl.appendPair("numSHAVEs", res.numSHAVEs);
 }
--- a/inference-engine/src/vpu/graph_transformer/src/parsed_config.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/parsed_config.cpp
@ -44,6 +44,7 @@ IE_SUPPRESS_DEPRECATED_START

        VPU_CONFIG_KEY(NUMBER_OF_SHAVES),
        VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES),
+        VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB),

        VPU_CONFIG_KEY(TENSOR_STRIDES),

@ -180,9 +181,28 @@ void ParsedConfig::parse(const std::map<std::string, std::string>& config) {
        setOption(_compileConfig.customLayers, config, CONFIG_KEY(CONFIG_FILE));
    }

-    setOption(_compileConfig.numSHAVEs, config, VPU_CONFIG_KEY(NUMBER_OF_SHAVES), parseInt);
-    setOption(_compileConfig.numCMXSlices, config, VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES), parseInt);
-    setOption(_compileConfig.numExecutors, config, VPU_MYRIAD_CONFIG_KEY(THROUGHPUT_STREAMS), parseInt);
+    // Zero is a legal setting (e.g. TILING_CMX_LIMIT_KB=0), so the predicate is "non-negative", not "positive".
+    auto isNonNegative = [](int value) {
+        return value >= 0;
+    };
+
+    auto isDefaultValue = [](int value) {  // -1 is the "not set" initializer of the CompilationConfig resource fields
+        return value == -1;
+    };
+
+    // Parses an integer compile option; accepts -1 (default) or any value >= 0, otherwise throws std::invalid_argument.
+    auto preprocessCompileOption = [&](const std::string& src) {
+        const int value = parseInt(src);
+        if (isNonNegative(value) || isDefaultValue(value)) {
+            return value;
+        }
+        throw std::invalid_argument("Value must be non-negative or default(-1).");
+    };
+
+    setOption(_compileConfig.numSHAVEs, config, VPU_CONFIG_KEY(NUMBER_OF_SHAVES), preprocessCompileOption);
+    setOption(_compileConfig.numCMXSlices, config, VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES), preprocessCompileOption);
+    setOption(_compileConfig.numExecutors, config, VPU_MYRIAD_CONFIG_KEY(THROUGHPUT_STREAMS), preprocessCompileOption);
+    setOption(_compileConfig.tilingCMXLimitKB, config, VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), preprocessCompileOption);

    if ((_compileConfig.numSHAVEs < 0 && _compileConfig.numCMXSlices >= 0) ||
        (_compileConfig.numSHAVEs >= 0 && _compileConfig.numCMXSlices < 0)) {
@ -214,7 +234,10 @@ IE_SUPPRESS_DEPRECATED_END
        _compileConfig.dumpAllPasses = std::stoi(envVar) != 0;
    }
    if (const auto envVar = std::getenv("IE_VPU_NUMBER_OF_SHAVES_AND_CMX_SLICES")) {
-        _compileConfig.numSHAVEs = _compileConfig.numCMXSlices = std::stoi(envVar);
+        _compileConfig.numSHAVEs = _compileConfig.numCMXSlices = preprocessCompileOption(envVar);
+    }
+    if (const auto envVar = std::getenv("IE_VPU_TILING_CMX_LIMIT_KB")) {
+        _compileConfig.tilingCMXLimitKB = preprocessCompileOption(envVar);
    }
 #endif
 }
--- a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/config.cpp
+++ b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/config.cpp
@ -27,6 +27,10 @@ namespace {
            {{VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION), CONFIG_VALUE(YES)}},
            {{VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION), CONFIG_VALUE(NO)}},

+            {{VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), "-1"}},
+            {{VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), "0"}},
+            {{VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), "10"}},
+
            {{VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME), CONFIG_VALUE(YES)}},
            {{VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME), CONFIG_VALUE(NO)}},
            {{VPU_MYRIAD_CONFIG_KEY(PROTOCOL), VPU_MYRIAD_CONFIG_VALUE(USB)}},
@ -73,6 +77,8 @@ namespace {
            {{VPU_MYRIAD_CONFIG_KEY(PLATFORM), "0"}},
            {{VPU_MYRIAD_CONFIG_KEY(PLATFORM), "1"}},

+            {{VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), "-10"}},
+
            {{VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME), "ON"}},
            {{VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME), "OFF"}}
    };
--- a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/infer_request_config.cpp
+++ b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/behavior/infer_request_config.cpp
@ -20,7 +20,7 @@ namespace {
            {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}}
    };

-    const std::vector<std::map<std::string, std::string>> Inconfigs = {
+    const std::vector<std::map<std::string, std::string>> inferConfigs = {
            {},

            {{VPU_MYRIAD_CONFIG_KEY(FORCE_RESET), CONFIG_VALUE(YES)}},
@ -33,6 +33,10 @@ namespace {
            {{CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_DEBUG)}},
            {{CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_TRACE)}},

+            {{VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), "-1"}},
+            {{VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), "0"}},
+            {{VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB), "1"}},
+
            {{VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION), CONFIG_VALUE(YES)}},
            {{VPU_CONFIG_KEY(HW_STAGES_OPTIMIZATION), CONFIG_VALUE(NO)}},

@ -40,7 +44,7 @@ namespace {
            {{VPU_CONFIG_KEY(PRINT_RECEIVE_TENSOR_TIME), CONFIG_VALUE(NO)}}
    };

-    const std::vector<std::map<std::string, std::string>> InmultiConfigs = {
+    const std::vector<std::map<std::string, std::string>> inferMultiConfigs = {
            {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, CommonTestUtils::DEVICE_MYRIAD},
            {CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_DEBUG)}},
            {{InferenceEngine::MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES, CommonTestUtils::DEVICE_MYRIAD},
@ -65,13 +69,13 @@ namespace {
                            ::testing::Combine(
                                    ::testing::ValuesIn(netPrecisions),
                                    ::testing::Values(CommonTestUtils::DEVICE_MYRIAD),
-                                    ::testing::ValuesIn(Inconfigs)),
+                                    ::testing::ValuesIn(inferConfigs)),
                            InferConfigInTests::getTestCaseName);

    INSTANTIATE_TEST_CASE_P(smoke_Multi_BehaviorTests, InferConfigInTests,
                            ::testing::Combine(
                                    ::testing::ValuesIn(netPrecisions),
                                    ::testing::Values(CommonTestUtils::DEVICE_MULTI),
-                                    ::testing::ValuesIn(InmultiConfigs)),
+                                    ::testing::ValuesIn(inferMultiConfigs)),
                            InferConfigInTests::getTestCaseName);
 }  // namespace
--- a/inference-engine/tests/functional/plugin/shared/include/behavior/infer_request_config.hpp
+++ b/inference-engine/tests/functional/plugin/shared/include/behavior/infer_request_config.hpp
@ -26,6 +26,7 @@
 #include "ngraph_functions/subgraph_builders.hpp"

 namespace BehaviorTestsDefinitions {
+// TODO: rename to SetupInferWithConfigTests
 using InferConfigTests = BehaviorTestsUtils::BehaviorTestsBasic;

 TEST_P(InferConfigTests, canSetExclusiveAsyncRequests) {
@ -83,6 +84,7 @@ TEST_P(InferConfigTests, withoutExclusiveAsyncRequests) {
    }
 }

+// TODO: rename to InferWithConfigTests
 using InferConfigInTests = BehaviorTestsUtils::BehaviorTestsBasic;

 TEST_P(InferConfigInTests, CanInferWithConfig) {
--- a/inference-engine/tools/compile_tool/README.md
+++ b/inference-engine/tools/compile_tool/README.md
@ -43,6 +43,7 @@ compile_tool [OPTIONS]
                                                 This option must be used in order to compile blob without a connected Myriad device.
        -VPU_NUMBER_OF_SHAVES     <value>     Optional. Specifies number of shaves. Should be set with "VPU_NUMBER_OF_CMX_SLICES". Overwrites value from config.
        -VPU_NUMBER_OF_CMX_SLICES <value>     Optional. Specifies number of CMX slices. Should be set with "VPU_NUMBER_OF_SHAVES". Overwrites value from config.
+        -VPU_TILING_CMX_LIMIT_KB  <value>     Optional. Specifies CMX limit for data tiling in kB. Value should be equal or greater than -1, where -1 means default value of limit. Overwrites value from config.

    DLA options:
        -DLA_ARCH_NAME            <value>     Optional. Specify architecture name used to compile executable network for FPGA device.
--- a/inference-engine/tools/compile_tool/main.cpp
+++ b/inference-engine/tools/compile_tool/main.cpp
@ -39,6 +39,9 @@ static constexpr char number_of_shaves_message[] = "Optional. Specifies number o
 static constexpr char number_of_cmx_slices_message[] = "Optional. Specifies number of CMX slices."
                                                       " Should be set with \"VPU_NUMBER_OF_SHAVES\"."
                                                       " Overwrites value from config.";
+static constexpr char tiling_cmx_limit_message[] = "Optional. Specifies CMX limit for data tiling in kB."
+                                                   " Value should be equal or greater than -1, where -1 means default value of limit."
+                                                   " Overwrites value from config.";
 static constexpr char inputs_precision_message[] = "Optional. Specifies precision for all input layers of the network."
                                                   " Supported values: FP32, FP16, U8. Default value: FP16.";
 static constexpr char outputs_precision_message[] = "Optional. Specifies precision for all output layers of the network."
@ -70,6 +73,7 @@ DEFINE_string(ol, "", outputs_layout_message);
 DEFINE_string(VPU_MYRIAD_PLATFORM, "", platform_message);
 DEFINE_string(VPU_NUMBER_OF_SHAVES, "", number_of_shaves_message);
 DEFINE_string(VPU_NUMBER_OF_CMX_SLICES, "", number_of_cmx_slices_message);
+DEFINE_string(VPU_TILING_CMX_LIMIT_KB, "", tiling_cmx_limit_message);
 DEFINE_string(DLA_ARCH_NAME, "", dla_arch_name);

 static void showUsage() {
@ -91,6 +95,7 @@ static void showUsage() {
    std::cout << "      -VPU_MYRIAD_PLATFORM       <value>     "   << platform_message             << std::endl;
    std::cout << "      -VPU_NUMBER_OF_SHAVES      <value>     "   << number_of_shaves_message     << std::endl;
    std::cout << "      -VPU_NUMBER_OF_CMX_SLICES  <value>     "   << number_of_cmx_slices_message << std::endl;
+    std::cout << "      -VPU_TILING_CMX_LIMIT_KB   <value>     "   << tiling_cmx_limit_message     << std::endl;
    std::cout << "    DLA options:                             "                                   << std::endl;
    std::cout << "      -DLA_ARCH_NAME             <value>     "   << dla_arch_name                << std::endl;
    std::cout << std::endl;
@ -164,6 +169,10 @@ static std::map<std::string, std::string> configure(const std::string &configFil
        config[VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES)] = FLAGS_VPU_NUMBER_OF_CMX_SLICES;
    }

+    if (!FLAGS_VPU_TILING_CMX_LIMIT_KB.empty()) {
+        config[VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB)] = FLAGS_VPU_TILING_CMX_LIMIT_KB;
+    }
+
    if (!FLAGS_DLA_ARCH_NAME.empty()) {
        config["DLIA_ARCH_NAME"] = FLAGS_DLA_ARCH_NAME;
    }
--- a/inference-engine/tools/vpu/vpu_compile/README.md
+++ b/inference-engine/tools/vpu/vpu_compile/README.md
@ -36,6 +36,7 @@ myriad_compile [OPTIONS]
                                             This option must be used in order to compile blob without a connected Myriad device.
    -VPU_NUMBER_OF_SHAVES        <value>     Optional. Specifies number of shaves. Should be set with "VPU_NUMBER_OF_CMX_SLICES". Overwrites value from config.
    -VPU_NUMBER_OF_CMX_SLICES    <value>     Optional. Specifies number of CMX slices. Should be set with "VPU_NUMBER_OF_SHAVES". Overwrites value from config.
+    -VPU_TILING_CMX_LIMIT_KB     <value>     Optional. Specifies CMX limit for data tiling in kB. Value should be equal or greater than -1, where -1 means default value of limit. Overwrites value from config.
 ```

 Running the application with the empty list of options yields an error message.
--- a/inference-engine/tools/vpu/vpu_compile/main.cpp
+++ b/inference-engine/tools/vpu/vpu_compile/main.cpp
@ -36,6 +36,9 @@ static constexpr char number_of_shaves_message[] = "Optional. Specifies number o
 static constexpr char number_of_cmx_slices_message[] = "Optional. Specifies number of CMX slices."
                                                       " Should be set with \"VPU_NUMBER_OF_SHAVES\"."
                                                       " Overwrites value from config.";
+static constexpr char tiling_cmx_limit_message[] = "Optional. Specifies CMX limit for data tiling in kB."
+                                                   " Value should be equal or greater than -1, where -1 means default value of limit."
+                                                   " Overwrites value from config.";
 static constexpr char inputs_precision_message[] = "Optional. Specifies precision for all input layers of network."
                                                   " Supported values: FP32, FP16, U8. Default value: FP16.";
 static constexpr char outputs_precision_message[] = "Optional. Specifies precision for all output layers of network."
@ -58,6 +61,7 @@ DEFINE_string(iop, "", iop_message);
 DEFINE_string(VPU_MYRIAD_PLATFORM, "", platform_message);
 DEFINE_string(VPU_NUMBER_OF_SHAVES, "", number_of_shaves_message);
 DEFINE_string(VPU_NUMBER_OF_CMX_SLICES, "", number_of_cmx_slices_message);
+DEFINE_string(VPU_TILING_CMX_LIMIT_KB, "", tiling_cmx_limit_message);

 static void showUsage() {
    std::cout << std::endl;
@ -74,6 +78,7 @@ static void showUsage() {
    std::cout << "    -VPU_MYRIAD_PLATFORM         <value>     "   << platform_message             << std::endl;
    std::cout << "    -VPU_NUMBER_OF_SHAVES        <value>     "   << number_of_shaves_message     << std::endl;
    std::cout << "    -VPU_NUMBER_OF_CMX_SLICES    <value>     "   << number_of_cmx_slices_message << std::endl;
+    std::cout << "    -VPU_TILING_CMX_LIMIT_KB     <value>     "   << tiling_cmx_limit_message     << std::endl;
    std::cout << std::endl;
 }

@ -119,6 +124,10 @@ static std::map<std::string, std::string> configure(const std::string &configFil
        config[VPU_CONFIG_KEY(NUMBER_OF_CMX_SLICES)] = FLAGS_VPU_NUMBER_OF_CMX_SLICES;
    }

+    if (!FLAGS_VPU_TILING_CMX_LIMIT_KB.empty()) {
+        config[VPU_CONFIG_KEY(TILING_CMX_LIMIT_KB)] = FLAGS_VPU_TILING_CMX_LIMIT_KB;
+    }
+
    return config;
 }