@@ -65,188 +65,201 @@ namespace kernel_selector {
 
 Convolution_kernel_b_fs_zyx_fsv16_imad::BlockParams
 Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
     constexpr float max_reg_pressure = 0.75f;
 
     // TODO Investigate whether the algorithm below for selecting optimal block params could be reduced to:
     // 1. Enumerate possible block params as optimization space
     // 2. Prune invalid params (too high register pressure, too big local memory usage)
     // 3. Rank params according to some combination of:
     //    - compute/memory ratio
     //    - occupancy
     //    - register pressure
     //    - local memory usage
     // 4. Select params with highest rank
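+    // Input extent needed for "w" outputs along one axis: (w - 1) * stride + (filter_size - 1) * dilation + 1,
+    // i.e. the receptive field of "w" consecutive output points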
+    size_t max_block_width = getOutBlock_X(params.output.X().v, params.stride.x, params.filterSize.x, params.dilation.x);
+    size_t max_in_block_width = (max_block_width - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
+
+    size_t block_width = max_block_width;
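+    // Prefer the largest divisor of the output width in [ceil(max_block_width / 2); max_block_width],
+    // so that the last block in a row is not partial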
+    if (max_block_width > 1) {
+        for (size_t w = max_block_width; w >= CeilDiv(max_block_width, 2); w -= 1) {
+            if (params.output.X().v % w == 0) {
+                block_width = w;
+                break;
+            }
+        }
+    }
 
-    // Select optimal block width
-    size_t block_width = getOutBlock_X(params.output.X().v, params.stride.x, params.filterSize.x, params.dilation.x);
     size_t in_block_width = (block_width - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
 
     // If possible, increase the feature block size
     size_t block_features = simd;
     {
         size_t tmp_block_features = simd * 2;
         auto block2_params = BlockParams{ block_width, 1, 1, tmp_block_features, in_block_width, 1, 1, 1 };
 
         bool c_mul_f = params.weights.OFM().v % tmp_block_features == 0;
         bool c_reg_pressure = EstimateRegPressure(params, block2_params) <= max_reg_pressure;
 
         if (c_mul_f && c_reg_pressure) {
             block_features = tmp_block_features;
         }
     }
 
     // If there is not enough occupancy, try to perform a feature split and/or block reduction
     size_t feature_slm_split = 1;
 
     auto no_split_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 1 };
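 
     // Occupancy below 1 means the dispatch cannot even fill every hardware thread on the device once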
     if (EstimateOccupancy(params, no_split_params) < 1.f) {
         // Temporary variables for possible reductions in block sizes
         bool update_block_params = false;
         size_t split_block_width = block_width;
         size_t split_in_block_width = in_block_width;
         size_t split_block_features = block_features;
 
         // Feature split requires extra registers, so check if it can be done with current block sizes
         bool can_split =
             EstimateRegPressure(params, BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, 2 }) <= max_reg_pressure;
 
         // Whether the occupancy has reached a sufficient level
         bool enough_occupancy = false;
         // Reductions to lower register pressure
         // Try to reduce block width to free some registers. A good compute/memory ratio is pointless if barely any threads run.
         if (!can_split && block_width != 1) {
             // At most a twofold reduction of the output block width is acceptable
             for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
                 size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
                 auto dummy_split_params = BlockParams{ w, 1, 1, block_features, tmp_in_width, 1, 1, 2 };
 
                 bool c_reg_pressure = EstimateRegPressure(params, dummy_split_params) <= max_reg_pressure;
                 bool c_mul_x = params.output.X().v % w == 0;
 
                 if (c_reg_pressure && c_mul_x) {
                     split_block_width = w;
                     split_in_block_width = tmp_in_width;
                     can_split = true;
                     break;
                 }
             }
         }
         // Try to reduce block features.
         // Done after attempting block width reduction, because a bigger feature block allows more threads to write results in parallel.
         if (!can_split) {
             if (block_features / simd % 2 == 0) {
                 split_block_features = block_features / 2;
                 can_split = true;
             }
         }
         // Check whether the previous reductions have improved occupancy enough
         {
             auto reduced_params = BlockParams{ split_block_width, 1, 1, split_block_features, split_in_block_width, 1, 1, 1 };
             enough_occupancy = EstimateOccupancy(params, reduced_params) >= 1.f;
             update_block_params = enough_occupancy;
         }
 
         if (can_split && !enough_occupancy) {
             // TODO Try other split sizes
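             // (the loop below currently evaluates only split == 4)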
             for (size_t split = 4; split < 5; ++split) {
                 auto tmp_params = BlockParams{ block_width, 1, 1, block_features, in_block_width, 1, 1, split };
 
                 bool c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % split == 0;
                 bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
                 bool c_lws = split * simd <= params.engineInfo.maxWorkGroupSize;
                 bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
                 bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
 
                 if (c_ifm_mul && c_slm && c_lws && c_reg_pressure) {
                     feature_slm_split = split;
                     update_block_params = true;
                     enough_occupancy = c_occupancy;
                 }
 
                 // slm usage and work group sizes will only grow with a bigger split, so there is no point in checking further
                 if (!c_slm || !c_lws || split * fsv >= params.weights.IFM().v)
                     break;
             }
         }
         // Splitting was not sufficient or couldn't be done
         // Try to reduce the block width if it hasn't been reduced before
         if (!enough_occupancy && split_block_width == block_width && block_width != 1) {
             // At most a twofold reduction of the output block width is acceptable
             for (size_t w = block_width; w >= CeilDiv(block_width, 2); w -= 1) {
                 size_t tmp_in_width = (w - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;
                 auto tmp_params = BlockParams{ w, 1, 1, split_block_features, tmp_in_width, 1, 1, feature_slm_split };
 
                 bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
                 bool c_mul_x = params.output.X().v % w == 0;
 
                 if (c_mul_x) {
                     split_block_width = w;
                     split_in_block_width = tmp_in_width;
                     update_block_params = true;
                 }
                 // Reached enough occupancy, don't reduce further to not hurt compute/mem ratio
                 if (c_mul_x && c_occupancy)
                     break;
             }
         }
         if (update_block_params) {
             block_width = split_block_width;
             in_block_width = split_in_block_width;
             block_features = split_block_features;
         }
     }
 
     // Select the biggest block height and depth that fit into registers
     size_t block_height = 1;
     size_t block_depth = 1;
     size_t in_block_height = 1;
     size_t in_block_depth = 1;
 
-    for (size_t d = 1; d < 16; ++d) {
-        if (params.output.Z().v % d != 0)
-            continue;
-        for (size_t h = 2; h < 16; ++h) {
-            if (params.output.Y().v % h != 0)
-                continue;
-
-            size_t tmp_in_block_depth = (d - 1) * params.stride.z + (params.filterSize.z - 1) * params.dilation.z + 1;
-            size_t tmp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
-            auto tmp_params = BlockParams{ block_width, h, d, block_features, in_block_width, tmp_in_block_height, tmp_in_block_depth, feature_slm_split };
-
-            bool c_reg_pressure = EstimateRegPressure(params, tmp_params) <= max_reg_pressure;
-            bool c_occupancy = EstimateOccupancy(params, tmp_params) >= 1.f;
-            bool c_slm = EstimateSLMUsage(params, tmp_params) <= 1.f;
-
-            if (c_reg_pressure && c_occupancy && c_slm) {
-                block_height = h;
-                block_depth = d;
-                in_block_height = tmp_in_block_height;
-                in_block_depth = tmp_in_block_depth;
-            }
-        }
-    }
+    bool break_external_loop = false;
+
+    // Estimate basic block params ratio
+    auto test_block_params = BlockParams{ block_width, 1, 1, simd, in_block_width, 1, 1, 1 };
+    auto best_block_params_ratio = EstimateBlockParamsRatio(params, test_block_params);
+
+    size_t max_slm_split = params.engineInfo.maxWorkGroupSize / simd;
+
+    // TGLU exceptions related to SLM usage
+    if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.computeUnitsCount == 96) {
+        bool split_exception_1 = params.output.X().v == 3 && params.output.Y().v == 3 && params.output.Z().v == 1 && params.output.Feature().v == 512;
+        bool split_exception_2 = params.output.X().v == 5 && params.output.Y().v == 5 && params.output.Z().v == 1 && params.output.Feature().v == 256;
+        bool split_exception_3 = params.output.X().v == 9 && params.output.Y().v == 9 && params.output.Z().v == 1 && params.output.Feature().v == 128;
+        bool split_exception_4 = params.output.X().v == 18 && params.output.Y().v == 18 && params.output.Z().v == 1 && params.output.Feature().v == 64;
+
+        if (split_exception_1 || split_exception_2 || split_exception_3 || split_exception_4)
+            max_slm_split = 2;
+    }
+
+    // Check ratio in cycle for all available block params
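+    // The search space: w in {block_width, max_block_width}, power-of-two feature splits up to max_slm_split,
+    // feature blocks of simd and 2 * simd, and all divisors d, h (< 16) of the output Z and Y sizes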
+    for (size_t w = 0; w < 2; w++) {
+        size_t temp_block_width = block_width;
+        size_t temp_in_block_width = in_block_width;
+
+        if (w == 1) {
+            if (max_block_width > 1) {
+                temp_block_width = max_block_width;
+                temp_in_block_width = max_in_block_width;
+            } else {
+                break_external_loop = true;
+                break;
+            }
+        }
+
+        if (break_external_loop) {
+            break;
+        }
+        for (size_t split = 1; split <= max_slm_split; split *= 2) {
+            for (size_t temp_block_features = simd; temp_block_features <= simd * 2; temp_block_features += simd) {
+                for (size_t d = 1; d < 16; ++d) {
+                    if (params.output.Z().v % d)
+                        continue;
+                    for (size_t h = 1; h < 16; ++h) {
+                        if (params.output.Y().v % h)
+                            continue;
+
+                        bool c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % split == 0;
+                        bool c_mul_f = temp_block_features == simd ? true : params.weights.OFM().v % temp_block_features == 0;
+
+                        size_t temp_block_height = 1;
+                        size_t temp_block_depth = 1;
+                        size_t temp_in_block_height = 1;
+                        size_t temp_in_block_depth = 1;
+
+                        if (h != 1) {
+                            temp_block_height = h;
+                            temp_block_depth = d;
+                            temp_in_block_height = (h - 1) * params.stride.y + (params.filterSize.y - 1) * params.dilation.y + 1;
+                            temp_in_block_depth = (d - 1) * params.stride.z + (params.filterSize.z - 1) * params.dilation.z + 1;
+                        }
+
+                        // Estimate current block params ratio
+                        test_block_params = BlockParams{ temp_block_width, temp_block_height, temp_block_depth, temp_block_features,
+                                                         temp_in_block_width, temp_in_block_height, temp_in_block_depth, split };
+                        auto block_params_ratio = EstimateBlockParamsRatio(params, test_block_params);
+
+                        // Try to increase block_params_ratio
+                        if (c_ifm_mul && c_mul_f && block_params_ratio > best_block_params_ratio) {
+                            best_block_params_ratio = block_params_ratio;
+
+                            // Update block params if current ratio is better than previous
+                            block_width = temp_block_width;
+                            block_height = temp_block_height;
+                            block_depth = temp_block_depth;
+                            block_features = temp_block_features;
+
+                            in_block_width = temp_in_block_width;
+                            in_block_height = temp_in_block_height;
+                            in_block_depth = temp_in_block_depth;
+                            feature_slm_split = split;
+                        }
+                    }
+                }
+            }
+            if (split * fsv >= params.weights.IFM().v)
+                break;
+        }
+    }
 
     return BlockParams{ block_width, block_height, block_depth, block_features, in_block_width, in_block_height, in_block_depth, feature_slm_split };
 }
+
+float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateBlockParamsRatio(const convolution_params& params, const BlockParams& block) const {
+    float occupancy_by_logic_size = static_cast<float>(params.output.LogicalSize() / static_cast<size_t>(params.engineInfo.maxThreadsPerDevice));
+    bool increase_max_reg_pressure = occupancy_by_logic_size >= 595.f;
+    bool twice_increase_max_reg_pressure = occupancy_by_logic_size >= 595.f * 2.f;
+    float max_reg_pressure = twice_increase_max_reg_pressure ? 0.785f : increase_max_reg_pressure ? 0.75f : 0.7f;
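+    // i.e. the register-pressure cap is relaxed from 0.7 to 0.75 (and to 0.785) once the output has
+    // at least 595x (respectively 1190x) more elements than the device has hardware threads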
+
+    constexpr float max_occupancy = 2.f;
+    constexpr float max_slm_usage = 1.f;
+
+    // Estimate occupancy, slm usage and register pressure
+    float occupancy = EstimateOccupancy(params, block);
+    float slm_usage = EstimateSLMUsage(params, block);
+    float reg_pressure = EstimateRegPressure(params, block);
+
+    // Estimate fb32 usage factor
+    auto& output = params.output;
+    float feature_block_32 = static_cast<float>(block.output_block_features == 32);
+    float fb32_factor = -5.f;
+    if (params.engineInfo.deviceType == dev_type::discrete_gpu && params.engineInfo.bIMADSupport) {
+        // Known cases where fb32 for discrete GPU works better
+        bool fb32_exception_1 = output.X().v % 13 == 0 && output.X().v * output.Feature().v == 13312;
+        bool fb32_exception_2 = (output.X().v % 28 == 0 && output.X().v * output.Feature().v == 14336) || (output.X().v == 14 && output.Feature().v == 512);
+        bool fb32_exception_3 = (output.X().v == 5 || output.X().v == 9) && output.Feature().v == 128;
+        bool fb32_exception_4 = output.X().v == 18 && output.Feature().v == 64;
+        bool fb32_exception_5 = output.X().v == 37 && output.Feature().v == 512;
+        bool fb32_exception_6 = output.X().v == 17 && output.Feature().v == 256;
+
+        // Accumulate the exceptions for z == 1
+        bool fb32_exceptions = fb32_exception_1 || fb32_exception_2 || fb32_exception_3 || fb32_exception_4 || fb32_exception_5 || fb32_exception_6;
+
+        // Exception for z != 1
+        bool fb32_exception_z = output.X().v == output.Y().v && output.X().v % 28 == 0 && output.Z().v == 40 && output.Feature().v % 32 == 0;
+
+        if ((output.X().v == output.Y().v && output.Z().v == 1 && fb32_exceptions) || fb32_exception_z)
+            fb32_factor = 1.f;
+    } else if (occupancy_by_logic_size >= 2500.f) {
+        fb32_factor = 0.5f;
+    }
+
+    // We use the arctangent function below to estimate reg_pressure_factor and slm_usage_factor because arctangent
+    // is a monotonically increasing function with positive values for x > 0 (we are only interested in positive
+    // values because occupancy is always positive). For low occupancy (e.g. < 1) we shouldn't concentrate on
+    // reg_pressure_factor and slm_usage_factor, because the primary target in these cases is to increase occupancy.
+    // So for occupancy < 1 these factors stay low, and they grow as occupancy grows. Pi (3.14159f) is a scaling
+    // coefficient that keeps the function values in the range [0; 0.5).
+    float reg_pressure_factor = atanf(occupancy) / 3.14159f;
+    float slm_usage_factor = atanf(occupancy) / 3.14159f;
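+    // e.g. atan(0.5) / pi ~= 0.15, atan(1) / pi = 0.25, atan(2) / pi ~= 0.35 -- both factors saturate below 0.5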
+
+    size_t cur_increase_occupancy_coeff = (block.output_block_features == fsv ? 2 : 1) * block.feature_slm_split;
+    size_t max_increase_occupancy_coeff = 2 * params.engineInfo.maxWorkGroupSize / simd;
+    float can_increase_occupancy_coeff = static_cast<float>(max_increase_occupancy_coeff) / static_cast<float>(cur_increase_occupancy_coeff);
+
+    // If occupancy is below 1.0, check whether it could still be increased by other block params
+    auto c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % (params.engineInfo.maxWorkGroupSize / simd) == 0;
+    auto can_increase_occupancy = (occupancy * can_increase_occupancy_coeff >= 1.0f) && c_ifm_mul;
+
+    float reduce_occupancy = 0.0f;
+    if (occupancy > max_occupancy) {
+        reduce_occupancy = log10f(occupancy - max_occupancy);
+        occupancy = max_occupancy;
+    }
+
+    // Estimate current block_params_ratio
+    float block_params_ratio = occupancy +
+                               feature_block_32 * fb32_factor +
+                               slm_usage * slm_usage_factor +
+                               reg_pressure * reg_pressure_factor -
+                               reduce_occupancy;
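+    // Higher is better: raw occupancy dominates the score, the slm and register terms each contribute
+    // less than 0.5, and the fb32 term rewards or penalizes 32-feature blocks depending on the device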
+
+    // Check all restrictions
+    bool bad_block_params = reg_pressure > max_reg_pressure || slm_usage > max_slm_usage || (occupancy < 1.0f && can_increase_occupancy);
+
+    return bad_block_params ? -10.f : block_params_ratio;
+}
 
 float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateRegPressure(const convolution_params& params, const BlockParams& block) const {
     size_t bytes_used = 0;
-    // accumulator
+
+    // Accumulator
     size_t accumulator_elements = block.output_block_width * block.output_block_height * block.output_block_depth * block.output_block_features;
     bytes_used += accumulator_elements * BytesPerElement(GetAccumulatorType(params));
-    // input block
+
+    // Input block
     size_t input_block_elements = block.input_block_depth * block.input_block_height * Align(block.input_block_width, simd) * fsv;
     bytes_used += input_block_elements * BytesPerElement(params.inputs[0].GetDType());
-    // weights block
+
+    // Weights block
    size_t weights_block_elements = block.output_block_features * fsv;
     bytes_used += weights_block_elements * BytesPerElement(params.weights.GetDType());
@@ -275,22 +288,60 @@ float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateOccupancy(const convolution_params& params, const BlockParams& block) const {
     size_t block_b = params.output.Batch().v;
 
     auto threads = blocks_w * blocks_h * blocks_d * blocks_f * block_b;
-    constexpr size_t max_threads_per_cu = 7;
-    size_t compute_units = params.engineInfo.computeUnitsCount;
-    size_t max_threads = compute_units * max_threads_per_cu;
 
-    return static_cast<float>(threads) / static_cast<float>(max_threads);
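+    // A result above 1 means there are more software threads than the device can execute concurrently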
+    return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
 }
 
 float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const {
-    size_t slm_elements = block.output_block_width * block.output_block_height * block.output_block_depth *
-                          block.output_block_features * (block.feature_slm_split - 1);
-    size_t slm_bytes = slm_elements * BytesPerElement(GetAccumulatorType(params));
+    if (block.feature_slm_split == 1)
+        return 0.f;
 
-    // TODO: Actual maximum slm should also depend on number of work-groups, but this is device specific
-    size_t max_slm_bytes = params.engineInfo.maxLocalMemSize;
+    size_t slm_elements_per_work_group = block.output_block_width * block.output_block_height * block.output_block_depth *
+                                         block.output_block_features * (block.feature_slm_split - 1);
+    size_t slm_bytes_per_work_group = slm_elements_per_work_group * BytesPerElement(GetAccumulatorType(params));
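+    // SLM holds one set of partial accumulators for every extra thread in the feature split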
 
-    return static_cast<float>(slm_bytes) / static_cast<float>(max_slm_bytes);
+    // Check maxLocalMemSize limitations
+    size_t max_slm_bytes_per_sub_slice = params.engineInfo.maxLocalMemSize;
+    if (slm_bytes_per_work_group > max_slm_bytes_per_sub_slice)
+        return 0.f;
+
+    // Estimate the number of work groups
+    const auto& output = params.output;
+    size_t work_groups_number = CeilDiv(output.X().v, block.output_block_width) *
+                                CeilDiv(output.Y().v, block.output_block_height) *
+                                CeilDiv(output.Z().v, block.output_block_depth) *
+                                output.Batch().v *
+                                CeilDiv(params.weights.OFM().v, block.output_block_features) *
+                                params.groups;
+
+    // Check work groups per device limitations
+    size_t max_threads_per_compute_unit = static_cast<size_t>(params.engineInfo.maxThreadsPerExecutionUnit);
+    constexpr size_t max_compute_units_per_sub_slice = 8;
+    constexpr size_t max_work_groups_per_sub_slice = 16;
+    size_t max_sub_slices_per_device = params.engineInfo.computeUnitsCount / max_compute_units_per_sub_slice;
+    size_t max_work_groups_per_device = max_sub_slices_per_device * max_work_groups_per_sub_slice;
+    if (work_groups_number > max_work_groups_per_device * 100)
+        return 0.f;
+
+    // Estimate the number of work groups per sub slice
+    size_t threads_per_work_group = block.feature_slm_split;
+    size_t threads_per_sub_slice = max_threads_per_compute_unit * max_compute_units_per_sub_slice;
+    size_t current_max_work_groups_per_sub_slice = threads_per_sub_slice / threads_per_work_group;
+    while (current_max_work_groups_per_sub_slice * slm_bytes_per_work_group > max_slm_bytes_per_sub_slice)
+        current_max_work_groups_per_sub_slice--;
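+    // i.e. limit the number of co-resident work groups to how many SLM buffers fit in the sub slice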
 
+    // The best case for SLM usage in terms of time spent is a single work group per sub slice,
+    // because then no time is spent waiting for synchronization between work groups in the sub slice
+    if (current_max_work_groups_per_sub_slice == 1)
+        return 1.0;
+
+    // Estimate the size of the SLM memory used
+    float max_slm_bytes_per_work_group = static_cast<float>(max_slm_bytes_per_sub_slice) / static_cast<float>(current_max_work_groups_per_sub_slice);
+    max_slm_bytes_per_work_group = static_cast<float>(Align(static_cast<size_t>(max_slm_bytes_per_work_group), 1024));
+    if (max_slm_bytes_per_work_group * static_cast<float>(current_max_work_groups_per_sub_slice) > static_cast<float>(max_slm_bytes_per_sub_slice))
+        max_slm_bytes_per_work_group -= 1024.0;
+
+    return static_cast<float>(slm_bytes_per_work_group) / static_cast<float>(max_slm_bytes_per_work_group);
 }
 
 ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {