[IE CLDNN] Convolution fsv16 improvements several fixes after code review (#3637)

2021-01-21 11:06:43 +03:00 · 2021-01-21 11:06:43 +03:00 · 72c3e0e4a6
commit 72c3e0e4a6
parent d4f774f3c7
6 changed files with 364 additions and 148 deletions
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp
@ -19,16 +19,13 @@
 namespace kernel_selector {
 static const size_t sub_group_size = 16;
 static const size_t feature_block_size = 16;
 ConvolutionKernel_b_fs_yx_fsv16::ConvolutionKernel_b_fs_yx_fsv16() : ConvolutionKernelBase("convolution_gpu_bfyx_f16") {
-    std::vector<size_t> outputBlockWidths = {2, 4, 8};
+    std::vector<size_t> outputBlockWidths = { 2, 4, 8 };
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
    for (auto w : outputBlockWidths) {
        for (auto exeMode : executionModes) {
-            autoTuneOptions.emplace_back(AutoTuneOption{w, exeMode});
+            autoTuneOptions.emplace_back(AutoTuneOption{ w, exeMode });
        }
    }
 }
@ -39,7 +36,7 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
    auto x = cp.output.X().v;
    auto f = cp.output.Feature().v;
    if (x * f <= 256) {
-        if ( x <= 8 || x * f <= 128)
+        if (x <= 8 || x * f <= 128)
            return { 2, DEFAULT };
        else
            return { 4, DEFAULT };
@ -55,6 +52,42 @@ ConvolutionKernel_b_fs_yx_fsv16::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16:
    }
 }
 float ConvolutionKernel_b_fs_yx_fsv16::EstimateOccupancy(const convolution_params& params,
                                                         const ConvolutionTuningData& tuning_data) const {
    auto tuneOptions = GetAutoTuneOptions(params, 0);
    auto blockWidth = tuneOptions.blockWidth;
    auto x = params.output.X().v;
    auto y = params.output.Y().v;
    auto f = params.output.Feature().v;
    auto b = params.output.Batch().v;
    auto threads = CeilDiv(x, blockWidth) * y * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
    return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
 }
 ConvolutionKernel_b_fs_yx_fsv16::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16::GetTuningParams(const convolution_params& params) const {
    ConvolutionTuningData tuning_data;
    const auto& input = params.inputs[0];
    size_t ic_blocks = CeilDiv(input.Feature().v / params.groups, tuning_data.feature_block_size);
    size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
    bool slm_exception = params.output.X().v == 3 && params.output.Y().v == 3 && params.output.ElementSize() == 4 && params.output.Feature().v <= 512;
    if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !slm_exception)
        while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
               EstimateOccupancy(params, tuning_data) < 4.0)
            tuning_data.slm_div_factor *= 2;
    tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
    return tuning_data;
 }
 ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
@ -92,6 +125,8 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
                                                                                int autoTuneIndex) const {
    DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
    ConvolutionTuningData tuning_data = GetTuningParams(params);
    const auto& out = params.output;
    auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
@ -103,11 +138,11 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16::SetDefault(
    auto b = out.Batch().v;
    dispatchData.gws[0] = CeilDiv(x, autoTune.blockWidth) * y;
-    dispatchData.gws[1] = Align(f, sub_group_size);
+    dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
    dispatchData.gws[2] = b;
    dispatchData.lws[0] = 1;
-    dispatchData.lws[1] = sub_group_size;
+    dispatchData.lws[1] = tuning_data.work_group_size;
    dispatchData.lws[2] = 1;
    return dispatchData;
@ -126,27 +161,29 @@ bool ConvolutionKernel_b_fs_yx_fsv16::Validate(const Params& p, const optional_p
    const auto& params = static_cast<const convolution_params&>(p);
    ConvolutionTuningData tuning_data = GetTuningParams(params);
    const auto& input = params.inputs[0];
    const auto& output = params.output;
    if (params.groups > 1) {
        auto outFeaturesPerGroup = output.Feature().v / params.groups;
        auto inFeaturesPerGroup = input.Feature().v / params.groups;
-        auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
+        auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
-                                          (feature_block_size % inFeaturesPerGroup == 0) &&
+                                          (tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
-                                          (feature_block_size / outFeaturesPerGroup > 1) &&
+                                          (tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
-                                          (feature_block_size / inFeaturesPerGroup > 1) &&
+                                          (tuning_data.feature_block_size / inFeaturesPerGroup > 1) &&
                                          (outFeaturesPerGroup != 1) &&
                                          (inFeaturesPerGroup != 1);
-        auto grouped = inFeaturesPerGroup % sub_group_size == 0 &&
+        auto grouped = inFeaturesPerGroup % tuning_data.sub_group_size == 0 &&
-                       (outFeaturesPerGroup % sub_group_size == 0 || sub_group_size % outFeaturesPerGroup == 0);
+                       (outFeaturesPerGroup % tuning_data.sub_group_size == 0 || tuning_data.sub_group_size % outFeaturesPerGroup == 0);
        if (!multipleGroupsInputPreload && !grouped)
            return false;
    }
    // Check that padding before features doesn't miss-align the blocks
-    if (input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0)
+    if (input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0)
        return false;
    if (!params.bias.empty() && params.bias[0].GetDType() != input.GetDType())
@ -161,11 +198,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
    auto output = params.output;
    auto jit = Parent::GetJitConstants(params, dispatchData);
    ConvolutionTuningData tuning_data = GetTuningParams(params);
    auto blockWidth = dispatchData.cldnnStyle.blockWidth;
    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_vec = { "_VEC",
-                                           {"b", "(f_block*16)", "y", "x"},
+                                           {"b", "(feature_block * 16)", "y", "x"},
                                           "dst",
                                           input_dt,
                                           blockWidth,
@ -174,7 +213,7 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
                                           IndexType::TENSOR_COORD,
                                           Tensor::DataChannelName::X };
        FusedOpsConfiguration conf_scalar = { "_SCALAR",
-                                              {"b", "(f_block*16)", "y", "(x+i)"},
+                                              {"b", "(feature_block * 16)", "y", "(x + i)"},
                                              "dst[i]",
                                              input_dt,
                                              1,
@ -190,23 +229,25 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16::GetJitConstants(const convolution_
    auto outFeaturesPerGroup = output.Feature().v / params.groups;
    auto inFeaturesPerGroup = input.Feature().v / params.groups;
-    auto multipleGroupsInputPreload = (feature_block_size % outFeaturesPerGroup == 0) &&
+    auto multipleGroupsInputPreload = (tuning_data.feature_block_size % outFeaturesPerGroup == 0) &&
-                                      (feature_block_size % inFeaturesPerGroup == 0) &&
+                                      (tuning_data.feature_block_size % inFeaturesPerGroup == 0) &&
-                                      (feature_block_size / outFeaturesPerGroup > 1) &&
+                                      (tuning_data.feature_block_size / outFeaturesPerGroup > 1) &&
-                                      (feature_block_size / inFeaturesPerGroup > 1);
+                                      (tuning_data.feature_block_size / inFeaturesPerGroup > 1);
    if (multipleGroupsInputPreload)
        jit.AddConstant(MakeJitConstant("MULTIPLE_GROUPS_INPUT_PRELOAD", 1));
    jit.AddConstant(MakeJitConstant("OUTPUT_X_BLOCK_SIZE", blockWidth));
    jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
+    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
    jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, blockWidth)));
-    jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, feature_block_size)));
+    jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
-    if (params.output.Feature().v % feature_block_size != 0) {
+    jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
    jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(inFeaturesPerGroup, tuning_data.feature_block_size)));
    if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
    }
-    if (inFeaturesPerGroup % feature_block_size != 0 && !multipleGroupsInputPreload) {
+    if (inFeaturesPerGroup % tuning_data.feature_block_size != 0 && !multipleGroupsInputPreload) {
        jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
    }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16.h
@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -58,7 +58,16 @@ private:
        std::string exeMode;
    };
    struct ConvolutionTuningData {
        const size_t sub_group_size = 16;
        const size_t feature_block_size = 16;
        size_t slm_div_factor = 1;
        size_t work_group_size = 1;
    };
    std::vector<AutoTuneOption> autoTuneOptions;
    AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
    ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
    float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
 };
 }  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
@ -19,11 +19,9 @@
 #include <string>
 namespace kernel_selector {
 static const size_t sub_group_size = 16;
 static const size_t feature_block_size = 16;
 ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : ConvolutionKernelBase("convolution_gpu_bfyx_f16_1x1") {
-    std::vector<size_t> outputBlockWidths = {2, 4, 8};
+    std::vector<size_t> outputBlockWidths = { 1, 2, 4, 8 };
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
    for (auto w : outputBlockWidths) {
@ -36,10 +34,15 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con
 ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
                                                                                                            int /*autoTuneIndex*/) const {
    const convolution_params& cp = static_cast<const convolution_params&>(params);
    auto x = cp.output.X().v;
    auto y = cp.output.Y().v;
    auto f = cp.output.Feature().v;
-    if (x * f <= 256) {
+
-        if ( x < 8 || x * f <= 128)
+    if (x == 1 && y == 1) {
        return { 1, DEFAULT };
    } else if (x * f <= 256) {
        if (x < 8 || x * f <= 128)
            return { 2, DEFAULT };
        else
            return { 4, DEFAULT };
@ -50,6 +53,40 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fs
    }
 }
 float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_params& params,
                                                             const ConvolutionTuningData& tuning_data) const {
    auto tuneOptions = GetAutoTuneOptions(params, 0);
    auto blockWidth = tuneOptions.blockWidth;
    auto x = params.output.X().v;
    auto y = params.output.Y().v;
    auto f = params.output.Feature().v;
    auto b = params.output.Batch().v;
    auto threads = CeilDiv(x * y, blockWidth) * CeilDiv(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor * b;
    return static_cast<float>(threads) / static_cast<float>(params.engineInfo.maxThreadsPerDevice);
 }
 ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const {
    ConvolutionTuningData tuning_data;
    const auto& input = params.inputs[0];
    size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size);
    size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;
    bool block_size_one_is_better = params.output.X().v == 1 && params.output.Y().v == 1 && input.Feature().v >= 2048;
    if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.bIMADSupport && !block_size_one_is_better)
        while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
               EstimateOccupancy(params, tuning_data) < 4.0)
            tuning_data.slm_div_factor *= 2;
    tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
    return tuning_data;
 }
 ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const {
    ParamsKey k;
@ -75,21 +112,24 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
                                                                               int autoTuneIndex) const {
    DispatchData dispatchData = ConvolutionKernelBase::SetDefault(params);
    ConvolutionTuningData tuning_data = GetTuningParams(params);
    auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
    dispatchData.cldnnStyle.blockWidth = autoTune.blockWidth;
    const auto& out = params.output;
    auto x = out.X().v;
    auto y = out.Y().v;
    auto f = out.Feature().v;
    auto b = out.Batch().v;
-    dispatchData.gws[0] = CeilDiv(x * y, autoTune.blockWidth);
+    dispatchData.gws[0] = x == 1 && y == 1 ? 1 : CeilDiv(x * y, autoTune.blockWidth);
-    dispatchData.gws[1] = Align(f, feature_block_size);
+    dispatchData.gws[1] = Align(f, tuning_data.feature_block_size) * tuning_data.slm_div_factor;
    dispatchData.gws[2] = b;
    dispatchData.lws[0] = 1;
-    dispatchData.lws[1] = sub_group_size;
+    dispatchData.lws[1] = tuning_data.work_group_size;
    dispatchData.lws[2] = 1;
    return dispatchData;
@ -124,14 +164,16 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p, const option
    const auto& params = static_cast<const convolution_params&>(p);
    ConvolutionTuningData tuning_data = GetTuningParams(params);
    const auto& input = params.inputs[0];
    const auto& output = params.output;
-    const bool bOutputSizes =
+    const bool bOutputSizes = output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
        output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0;
    const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1;
    const bool bStride = params.stride.x != 1 || params.stride.y != 1;
-    const bool bPadding = input.Feature().pad.before % feature_block_size != 0 || output.Feature().pad.before % feature_block_size != 0;
+    const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 ||
                          output.Feature().pad.before % tuning_data.feature_block_size != 0;
    if  (bOutputSizes || bFilterSize || bStride || bPadding) {
        return false;
@ -144,11 +186,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
                                                                  const DispatchData& dispatchData) const {
    auto jit = Parent::GetJitConstants(params, dispatchData);
    ConvolutionTuningData tuning_data = GetTuningParams(params);
    auto blockWidth = dispatchData.cldnnStyle.blockWidth;
    if (!params.fused_ops.empty()) {
        auto input_dt = GetUnitType(params);
        FusedOpsConfiguration conf_vec = { "_VEC",
-                                           {"b", "(f_block*16)", "y", "x"},
+                                           {"b", "(feature_block * 16)", "y", "x"},
                                           "dst",
                                           input_dt,
                                           blockWidth,
@ -156,8 +200,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
                                           BoundaryCheck::ENABLED,
                                           IndexType::TENSOR_COORD,
                                           Tensor::DataChannelName::X };
-        FusedOpsConfiguration conf_scalar = { "_SCALAR",
+        FusedOpsConfiguration conf_scalar1 = { "_SCALAR",
-                                              {"b", "(f_block*16)", "yi", "xi"},
+                                              {"b", "(feature_block * 16)", "yi", "xi"},
                                              "dst[i]",
                                              input_dt,
                                              1,
@ -165,10 +209,19 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
                                              BoundaryCheck::ENABLED,
                                              IndexType::TENSOR_COORD,
                                              Tensor::DataChannelName::X };
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
+        FusedOpsConfiguration conf_scalar2 = { "_SCALAR_B1",
                                              {"b", "(feature_block * 16)", "0", "0"},
                                              "dst",
                                              input_dt,
                                              1,
                                              LoadType::LT_ALIGNED_READ,
                                              BoundaryCheck::ENABLED,
                                              IndexType::TENSOR_COORD,
                                              Tensor::DataChannelName::X };
        jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 }));
    }
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
+    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
    jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
    bool padded_output = params.output.X().pad.Total() != 0;
@ -194,11 +247,13 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
    jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
    jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.output.X().v, blockWidth)));
-    jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, feature_block_size)));
+    jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
-    if (params.output.Feature().v % feature_block_size != 0) {
+    jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
    jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
    if (params.output.Feature().v % tuning_data.feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
    }
-    if (params.inputs[0].Feature().v % feature_block_size != 0) {
+    if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
        jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
    }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.h
@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -51,7 +51,16 @@ protected:
        std::string exeMode;
    };
    struct ConvolutionTuningData {
        const size_t sub_group_size = 16;
        const size_t feature_block_size = 16;
        size_t slm_div_factor = 1;
        size_t work_group_size = 1;
    };
    std::vector<AutoTuneOption> autoTuneOptions;
    AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
    ConvolutionTuningData GetTuningParams(const convolution_params& params) const;
    float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const;
 };
 }  // namespace kernel_selector
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
@ -40,7 +40,7 @@
 #   define INPUT_BLOCK_READ4(ptr, offset)   AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
 #   define INPUT_BLOCK_READ8(ptr, offset)   AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
 #else
-#   error convolution_gpu_bfyx_f16.cl - unsupported input type.
+#   error convolution_gpu_bfyx_f16.cl: unsupported input type
 #endif
 #if FILTER_TYPE_SIZE == 2
@ -48,7 +48,7 @@
 #elif FILTER_TYPE_SIZE == 4
 #   define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
 #else
-#   error convolution_gpu_bfyx_f16.cl - unsupported filter type.
+#   error convolution_gpu_bfyx_f16.cl: unsupported filter type
 #endif
 #if OUTPUT_TYPE_SIZE == 1
@ -67,7 +67,7 @@
 #   define OUTPUT_BLOCK_WRITE4(ptr, offset, val)   intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
 #   define OUTPUT_BLOCK_WRITE8(ptr, offset, val)   intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
 #else
-#   error convolution_gpu_bfyx_f16.cl - unsupported output type.
+#   error convolution_gpu_bfyx_f16.cl: unsupported output type
 #endif
 #if INPUT0_TYPE_SIZE == 2
@ -77,12 +77,14 @@
 #else
 #   define GET_SRC(data, id)    intel_sub_group_shuffle(data, id)
 #endif
 #define FEATURE_SLICE_SIZE 16
 #define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
 #define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
+__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
 KERNEL(convolution_bfyx_f16)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
@ -94,26 +96,29 @@ KERNEL(convolution_bfyx_f16)(
    FUSED_OPS_DECLS,
 #endif
    uint split_idx) {
-#if GROUPED
+    const int sglid = get_sub_group_local_id();
    const int f_block = get_group_id(1);
    const int group = (f_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
    const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (f_block * FEATURE_SLICE_SIZE);
    int groups_per_sub_group = 1;
    if (prev_group_leftover < 16)
        groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
 #else
    const int f_block = get_group_id(1);
    const int group = split_idx;
    const int groups_per_sub_group = 1;
 #endif  // GROUPED
    const int lid = get_sub_group_local_id();
    const int b = (uint)get_global_id(2);
    const int xy = get_global_id(0);
    const int x = (xy % X_BLOCKS) * OUTPUT_X_BLOCK_SIZE;
    const int y = (xy / X_BLOCKS);
    const int lid1 = (int)get_local_id(1);
    const int feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
    const int feature_sub_block = lid1 / feature_per_wg;
    const int feature_block = (int)get_group_id(1);
 #if GROUPED
    const int group = (feature_block * FEATURE_SLICE_SIZE) / FILTER_OFM_NUM;
    const int prev_group_leftover = (FILTER_OFM_NUM * (group + 1)) - (feature_block * FEATURE_SLICE_SIZE);
    int groups_per_sub_group = 1;
    if (prev_group_leftover < 16)
        groups_per_sub_group += ((FEATURE_SLICE_SIZE - prev_group_leftover - 1) / FILTER_OFM_NUM) + 1;
 #else
    const int group = split_idx;
    const int groups_per_sub_group = 1;
 #endif  // GROUPED
    typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, OUTPUT_X_BLOCK_SIZE) vec_t;
    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
@ -143,7 +148,7 @@ KERNEL(convolution_bfyx_f16)(
    const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
    const uint output_offset = b * output_b_pitch +
-                               (f_block + output_fs_pad_before) * output_fs_pitch +
+                               (feature_block + output_fs_pad_before) * output_fs_pitch +
                               (y + OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
                               (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
@ -155,54 +160,71 @@ KERNEL(convolution_bfyx_f16)(
    const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
 #if BIAS_TERM
-    uint bias_offset = f_block * FEATURE_SLICE_SIZE;
+#if SLM_DIV_FACTOR == 1
-    vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, bias_offset));
+    vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
 #else
    vec_t dst;
    if (feature_sub_block == 0) {
        dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
    } else {
        dst = INPUT0_VAL_ZERO;
    }
 #endif // SLM_DIV_FACTOR == 1
 #else
    vec_t dst = INPUT0_VAL_ZERO;
-#endif  // BIAS_TERM
+#endif // BIAS_TERM
 #if SLM_DIV_FACTOR > 1
    __local vec_t partial_summ[WORK_GROUP_SIZE];
 #endif
 #if MULTIPLE_GROUPS_INPUT_PRELOAD
-    const uint in_split_offset = f_block * input_fs_pitch;
+    const uint in_split_offset = feature_block * input_fs_pitch;
-    const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint g = sglid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
-    const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint ofm_in_group = sglid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
    const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
 #else
 #if GROUPED
    for (uint g = group; g < group + groups_per_sub_group; g++) {
        const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
        const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
-        const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
+        const uint filter_offset = (feature_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
 #else
        const uint in_split_offset = 0;
        const uint filter_split_offset = 0;
-        const uint filter_offset = f_block * filter_os_pitch;
+        const uint filter_offset = feature_block * filter_os_pitch;
 #endif  // GROUPED
        const uint grouped_filter_offset = filter_offset + filter_split_offset;
 #endif  // MULTIPLE_GROUPS_INPUT_PRELOAD
        const uint grouped_input_offset = input_offset + in_split_offset;
-        for (uint icb = 0; icb < IC_BLOCKS; icb++) {
+#if SLM_DIV_FACTOR > 1
        for (int icb = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; icb < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; icb++) {
 #else
        for (int icb = 0; icb < IC_BLOCKS; icb++) {
 #endif // SLM_DIV_FACTOR > 1
            __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
            for (int kh = 0; kh < FILTER_SIZE_Y; kh++) {
-                if (input_y + kh*DILATION_SIZE_Y < 0 || input_y + kh*DILATION_SIZE_Y >= INPUT0_SIZE_Y)
+                if (input_y + kh * DILATION_SIZE_Y < 0 || input_y + kh * DILATION_SIZE_Y >= INPUT0_SIZE_Y)
                    continue;
                INPUT_TYPE line_cache[INPUT_LINE_SIZE];
 #if INPUT_LEFTOVERS
-                if ((icb+1)*FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
+                if ((icb + 1) * FEATURE_SLICE_SIZE >= FILTER_IFM_NUM)
                {
                    for (int xb = 0; xb < INPUT_LINE_SIZE; xb++)
                    {
-                        if (icb*FEATURE_SLICE_SIZE + lid >= FILTER_IFM_NUM)
+                        if (icb * FEATURE_SLICE_SIZE + sglid >= FILTER_IFM_NUM)
                            line_cache[xb] = 0;
                        else
                            line_cache[xb] = input[grouped_input_offset +
                                                   icb * input_fs_pitch +
                                                   kh * DILATION_SIZE_Y * input_y_pitch +
                                                   xb * input_x_pitch +
-                                                   lid];
+                                                   sglid];
                    }
                }
                else
@ -251,7 +273,7 @@ KERNEL(convolution_bfyx_f16)(
 #if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
                        src[i] = line_cache[i];
 #else
-                        src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
+                        src[i] = line_cache[kw * DILATION_SIZE_X + STRIDE_SIZE_X * i];
 #endif  // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
                    }
 #if MULTIPLE_GROUPS_INPUT_PRELOAD
@ -300,7 +322,7 @@ KERNEL(convolution_bfyx_f16)(
                        dst = mad(wei0.s6, src6,  dst);
                        dst = mad(wei0.s7, src7,  dst);
 #else
-#error Unsupported input feature size for multiple groups input preload
+#   error convolution_gpu_bfyx_f16.cl: unsupported input feature size for multiple groups input preload
 #endif  // FILTER_IFM_NUM
 #else
                    FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
@ -352,13 +374,24 @@ KERNEL(convolution_bfyx_f16)(
 #if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
    }
 #endif  // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
 #if SLM_DIV_FACTOR > 1
    partial_summ[lid1] = dst;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (feature_sub_block == 0) {
        __attribute__((opencl_unroll_hint))
        for (int i = 1; i < SLM_DIV_FACTOR; i++)
            dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
 #endif // SLM_DIV_FACTOR > 1
    dst = ACTIVATION(dst, ACTIVATION_PARAMS);
    typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
    out_vec_t res;
 #if OUTPUT_LEFTOVERS
-    if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
+    if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
        for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
 #if HAS_FUSED_OPS
            FUSED_OPS_SCALAR;
@ -366,8 +399,8 @@ KERNEL(convolution_bfyx_f16)(
 #else
            res[i] = TO_OUTPUT_TYPE(dst[i]);
 #endif
-            if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
+            if ((feature_block * FEATURE_SLICE_SIZE + sglid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
-                output[output_offset + i * output_x_pitch + lid] = res[i];
+                output[output_offset + i * output_x_pitch + sglid] = res[i];
            }
        }
    }
@ -391,7 +424,7 @@ KERNEL(convolution_bfyx_f16)(
 #elif OUTPUT_X_BLOCK_SIZE == 1
            OUTPUT_BLOCK_WRITE(output, output_offset, res);
 #else
-#   error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
+#   error convolution_gpu_bfyx_f16.cl: unsupported output x block size
 #endif
        } else {
            for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
@ -405,6 +438,10 @@ KERNEL(convolution_bfyx_f16)(
            }
        }
    }
 #if SLM_DIV_FACTOR > 1
    }
 #endif
 }
 #undef AS_INPUT_SRC
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -16,16 +16,24 @@
 #include "include/unit_type.cl"
 #include "include/mmad.cl"
-#define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE),                             \
+#if X_BLOCK_SIZE > 1
-                            intel_sub_group_shuffle(                                                     \
+#   define GET_SRC(data, id)    AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE),                             \
-                                AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data),       \
+                                    intel_sub_group_shuffle(                                                   \
-                                id))
+                                    AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data),         \
                                    id))
 #else
 #   define GET_SRC(data, id)    AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
 #endif
 #define FEATURE_SLICE_SIZE 16
 #if X_BLOCK_SIZE > 1
 #   define UNIT_BLOCK_READ_VEC(ptr, offset)         CAT(UNIT_BLOCK_READ, X_BLOCK_SIZE)(ptr, offset)
 #   define UNIT_BLOCK_WRITE_VEC(ptr, offset, val)   CAT(UNIT_BLOCK_WRITE, X_BLOCK_SIZE)(ptr, offset, val)
 #endif
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
-__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
+__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
 KERNEL(convolution_b_fs_yx_fsv16_1x1)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
@ -37,16 +45,21 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
    FUSED_OPS_DECLS,
 #endif
    uint split_idx) {
-    const int xy = get_global_id(0);
+#if X_BLOCK_SIZE > 1
-    const int f_block = get_group_id(1);
+    const uint xy = (int)get_global_id(0);
-    const int b = get_global_id(2);
+    const uint x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
-    const int lid = get_sub_group_local_id();
+    const uint y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
-    const int x = (xy * X_BLOCK_SIZE) % OUTPUT_SIZE_X;
+    const uint input_x = x;
-    const int y = (xy * X_BLOCK_SIZE) / OUTPUT_SIZE_X;
+    const uint input_y = y;
 #endif
    const uint b = (int)get_global_id(2);
    const uint sglid = (int)get_sub_group_local_id();
-    const int input_x = x;
+    const uint lid1 = (int)get_local_id(1);
-    const int input_y = y;
+    const uint feature_per_wg = (int)get_local_size(1) / SLM_DIV_FACTOR;
    const uint feature_sub_block = lid1 / feature_per_wg;
    const uint feature_block = (int)get_group_id(1);
    // Input offset calculations:
    const uint input_x_pitch = FEATURE_SLICE_SIZE;
@ -71,8 +84,8 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
    const uint output_fs_pad_before = OUTPUT_PAD_BEFORE_FEATURE_NUM / FEATURE_SLICE_SIZE;
-    const uint output_offset =  b * output_b_pitch +
+    const uint output_offset = b * output_b_pitch +
-                               (f_block + output_fs_pad_before) * output_fs_pitch +
+                               (feature_block + output_fs_pad_before) * output_fs_pitch +
                               (OUTPUT_PAD_BEFORE_SIZE_Y) * output_y_pitch +
                               (OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
@ -83,71 +96,91 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
    const uint filter_is_pitch = filter_y_pitch * FILTER_SIZE_Y;
    const uint filter_os_pitch = filter_is_pitch * ((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE);
-    const uint filter_offset = f_block * filter_os_pitch;
+    const uint filter_offset = feature_block * filter_os_pitch;
 #if X_BLOCK_SIZE > 1
    typedef MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE) vec_t;
 #if BIAS_TERM
    vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
 #else
-    vec_t dst = UNIT_VAL_ZERO;
+    typedef UNIT_TYPE vec_t;
 #endif
-    for (uint k = 0; k < IC_BLOCKS; ++k)
+#if BIAS_TERM
 #if SLM_DIV_FACTOR == 1
    vec_t dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
 #else
    vec_t dst;
    if (feature_sub_block == 0) {
        dst = (vec_t)(UNIT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
    } else {
        dst = UNIT_VAL_ZERO;
    }
 #endif // SLM_DIV_FACTOR == 1
 #else
    vec_t dst = UNIT_VAL_ZERO;
 #endif // BIAS_TERM
 #if SLM_DIV_FACTOR > 1
    __local vec_t partial_summ[WORK_GROUP_SIZE];
    for (uint k = feature_sub_block * IC_BLOCKS / SLM_DIV_FACTOR; k < (feature_sub_block + 1) * IC_BLOCKS / SLM_DIV_FACTOR; k++)
    {
 #else
    for (uint k = 0; k < IC_BLOCKS; k++)
    {
 #endif // SLM_DIV_FACTOR > 1
        vec_t src = 0;
 #if INPUT_LEFTOVERS
-        if ((k+1)*FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
+        if ((k + 1) * FEATURE_SLICE_SIZE >= INPUT0_FEATURE_NUM)
        {
-            if (k*FEATURE_SLICE_SIZE + lid < INPUT0_FEATURE_NUM)
+            if (k * FEATURE_SLICE_SIZE + sglid < INPUT0_FEATURE_NUM)
            {
 #if X_BLOCK_SIZE > 1
                __attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
                for (int i = 0; i < X_BLOCK_SIZE; i++)
                {
                    const uint xb = (x + i) % INPUT0_SIZE_X;
                    const uint yb = y + (x + i) / INPUT0_SIZE_X;
-                    const uint input_idx = input_offset + k * input_fs_pitch +
+                    const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
-                                                          yb * input_y_pitch +
+
-                                                          xb * input_x_pitch;
+                    src[i] = input[input_idx + sglid];
                    src[i] = input[input_idx + lid];
                }
 #else
                src = input[input_offset + k * input_fs_pitch + sglid];
 #endif // X_BLOCK_SIZE > 1
            }
        }
        else
-#endif
+#endif // INPUT_LEFTOVERS
        {
 #if PADDED_INPUT
 #if X_BLOCK_SIZE > 1
            __attribute__((opencl_unroll_hint(X_BLOCK_SIZE)))
            for (int i = 0; i < X_BLOCK_SIZE; i++)
            {
                const uint xb = (x + i) % INPUT0_SIZE_X;
                const uint yb = y + (x + i) / INPUT0_SIZE_X;
-                const uint input_idx = input_offset + k * input_fs_pitch +
+                const uint input_idx = input_offset + k * input_fs_pitch + yb * input_y_pitch + xb * input_x_pitch;
-                                                      yb * input_y_pitch +
+
                                                      xb * input_x_pitch;
                src[i] = UNIT_BLOCK_READ(input, input_idx);
            }
 #else
-#if X_BLOCK_SIZE == 8
+            src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
-            src = UNIT_BLOCK_READ8(input, input_offset + k * input_fs_pitch +
+#endif // X_BLOCK_SIZE > 1
                                                         input_y * input_y_pitch +
                                                         input_x * input_x_pitch);
 #elif X_BLOCK_SIZE == 4
            src = UNIT_BLOCK_READ4(input, input_offset + k * input_fs_pitch +
                                                         input_y * input_y_pitch +
                                                         input_x * input_x_pitch);
 #elif X_BLOCK_SIZE == 2
            src = UNIT_BLOCK_READ2(input, input_offset + k * input_fs_pitch +
                                                         input_y * input_y_pitch +
                                                         input_x * input_x_pitch);
 #endif
 #endif
        }
 #else // PADDED_INPUT
 #if X_BLOCK_SIZE > 1
            src = UNIT_BLOCK_READ_VEC(input, input_offset + k * input_fs_pitch + input_y * input_y_pitch + input_x * input_x_pitch);
 #else
            src = UNIT_BLOCK_READ(input, input_offset + k * input_fs_pitch);
 #endif // X_BLOCK_SIZE > 1
 #endif // PADDED_INPUT
        }
        UNIT_TYPE8 wei0 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch);
        UNIT_TYPE8 wei1 = UNIT_BLOCK_READ8(weights, filter_offset + k * filter_is_pitch + 8 * filter_isv_pitch);
        const vec_t src0  = GET_SRC(src, 0);
        const vec_t src1  = GET_SRC(src, 1);
        const vec_t src2  = GET_SRC(src, 2);
@ -183,29 +216,52 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
        dst = mad(wei1.s7, src15, dst);
    }
 #if SLM_DIV_FACTOR > 1
    partial_summ[lid1] = dst;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (feature_sub_block == 0) {
        __attribute__((opencl_unroll_hint))
        for (int i = 1; i < SLM_DIV_FACTOR; i++)
            dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
 #endif // SLM_DIV_FACTOR > 1
    dst = ACTIVATION(dst, ACTIVATION_PARAMS);
 #if OUTPUT_LEFTOVERS
-    if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
+    if ((feature_block + 1) * FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM)
    {
 #if X_BLOCK_SIZE > 1
        for (int i = 0; i < X_BLOCK_SIZE; i++) {
-            if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y ||
+            if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
                f_block*FEATURE_SLICE_SIZE + lid >= OUTPUT_FEATURE_NUM)
                return;
-            int xi = (x+i) % OUTPUT_SIZE_X;
+            int xi = (x + i) % OUTPUT_SIZE_X;
-            int yi = y + ((x+i) / OUTPUT_SIZE_X);
+            int yi = y + ((x + i) / OUTPUT_SIZE_X);
 #if HAS_FUSED_OPS
            FUSED_OPS_SCALAR;
            dst[i] = FUSED_OPS_RESULT_SCALAR;
 #endif
-            output[output_offset + yi * output_y_pitch + xi * output_x_pitch + lid] = dst[i];
+            output[output_offset + yi * output_y_pitch + xi * output_x_pitch + sglid] = dst[i];
        }
 #else // X_BLOCK_SIZE > 1
        if (feature_block * FEATURE_SLICE_SIZE + sglid >= OUTPUT_FEATURE_NUM)
            return;
 #if HAS_FUSED_OPS
        FUSED_OPS_SCALAR_B1;
        dst = FUSED_OPS_RESULT_SCALAR_B1;
 #endif
        output[output_offset + sglid] = dst;
 #endif // X_BLOCK_SIZE > 1
    }
    else
-#endif
+#endif // OUTPUT_LEFTOVERS
 #if X_BLOCK_SIZE > 1
    {
 #if !PADDED_OUTPUT && !NON_UNIT_FUSED_OP_SPATIAL
        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
@ -216,20 +272,14 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
            FUSED_OPS_VEC;
            dst = FUSED_OPS_RESULT_VEC;
 #endif
-#if X_BLOCK_SIZE == 8
+            UNIT_BLOCK_WRITE_VEC(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
            UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
 #elif X_BLOCK_SIZE == 4
            UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
 #elif X_BLOCK_SIZE == 2
            UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
 #endif
        } else {
            for (int i = 0; i < X_BLOCK_SIZE; i++) {
                if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
                    return;
-                int xi = (x+i) % OUTPUT_SIZE_X;
+                int xi = (x + i) % OUTPUT_SIZE_X;
-                int yi = y + ((x+i) / OUTPUT_SIZE_X);
+                int yi = y + ((x + i) / OUTPUT_SIZE_X);
 #if HAS_FUSED_OPS
                FUSED_OPS_SCALAR;
@ -240,7 +290,22 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
            }
        }
    }
 #else // X_BLOCK_SIZE > 1
    {
 #if HAS_FUSED_OPS
        FUSED_OPS_SCALAR_B1;
        dst = FUSED_OPS_RESULT_SCALAR_B1;
 #endif
        UNIT_BLOCK_WRITE(output, output_offset, dst);
    }
 #endif // X_BLOCK_SIZE > 1
 #if SLM_DIV_FACTOR > 1
    }
 #endif
 }
 #undef GET_SRC
 #undef FEATURE_SLICE_SIZE
 #undef UNIT_BLOCK_READ_VEC
 #undef UNIT_BLOCK_WRITE_VEC