[IE CLDNN] int8 batches optimization (#632)
This commit is contained in:
parent
b9d67927fd
commit
f8b2627c3b
@ -83,6 +83,7 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
@ -0,0 +1,145 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"
#include <vector>
#include <iostream>

//
// Kernel specific constants
//
#define SIMD_SIZE 16

namespace kernel_selector {

ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::F32);

    k.EnableInputWeightsType(WeightsType::INT8);

    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

KernelsData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetCommonKernelsData(params, options);
}

JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
    auto mem_consts = Parent::GetJitConstants(params, kd);
    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"",
                                             {"out_b", "16 * j + out_f + get_sub_group_local_id()", "out_y", "out_x"},
                                             "dequantized",
                                             input_dt,
                                             1,
                                             LoadType::FEATURE_SHUFFLE};
        conf_scalar.SetLoopAxes({ Tensor::DataChannelName::BATCH }, true);
        conf_scalar.SetShuffleVarName("i");

        mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }

    return mem_consts;
}  // GetJitConstants

ConvolutionKernelBase::DispatchData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::SetDefault(const convolution_params& params, int) const {
    DispatchData kd;
    const auto& output = params.output;

    std::vector<size_t> global = {output.X().v, output.Y().v, output.Feature().v / 32 * output.Batch().v};
    std::vector<size_t> local = {1, 1, SIMD_SIZE};

    kd.gws0 = global[0];
    kd.gws1 = global[1];
    kd.gws2 = global[2];

    kd.lws0 = local[0];
    kd.lws1 = local[1];
    kd.lws2 = local[2];

    kd.cldnnStyle = {0, 0, 0, 0, 0};
    kd.gemmStyle = {0, 0, 0, 0, 0, 0};

    kd.efficiency = FORCE_PRIORITY_2;

    return kd;
}  // SetDefault

bool Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::Validate(const Params& params, const optional_params& options) const {
    if (!Parent::Validate(params, options)) {
        return false;
    }

    KernelData kd = KernelData::Default<convolution_params>(params);
    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());

    if ((newParams.filterSize.x != newParams.filterSize.y) || newParams.filterSize.x != 1) {
        // Filter size must be 1x1
        return false;
    }

    if (newParams.stride.x != newParams.stride.y) {
        // Strides must be equal
        return false;
    }
    if (newParams.output.X().v != newParams.output.Y().v) {
        // W and H must be equal
        return false;
    }

    if (newParams.output.Feature().v % 32 != 0) {
        // Output feature size must be divisible by 32
        return false;
    }

    if (newParams.output.Batch().v % 16 != 0) {
        // Batch size must be divisible by 16
        return false;
    }

    // Check that all fused ops except eltwise have only feature or scalar inputs
    for (auto& fo : newParams.fused_ops) {
        if (fo.GetType() == FusedOpType::ELTWISE)
            continue;
        for (auto& input : fo.tensors) {
            if (input.X().v != 1 || input.Y().v != 1 || input.Batch().v != 1)
                return false;
        }
    }

    return true;
}
}  // namespace kernel_selector
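For reference, a minimal host-side sketch (not part of this commit; F and B are made-up example sizes) of how SetDefault()'s third dispatch dimension (gws2 = Feature / 32 * Batch, lws2 = SIMD_SIZE = 16) partitions the output: each 16-lane work-group produces 32 output features for one 16-batch slice, which is exactly why Validate() rejects feature % 32 != 0 and batch % 16 != 0. The out_f / out_b formulas mirror the ones in the OpenCL kernel further down.

#include <cstdio>

int main() {
    const unsigned F = 64, B = 32;            // example output feature/batch sizes
    const unsigned groups = F / 32 * B / 16;  // gws2 / lws2 from SetDefault()
    for (unsigned g = 0; g < groups; ++g) {
        unsigned out_f = g * 32 % F;          // first of 32 features for this group
        unsigned out_b = (g * 32 / F) * 16;   // first of 16 batches (one per lane)
        std::printf("group %u -> features [%u..%u), batches [%u..%u)\n",
                    g, out_f, out_f + 32, out_b, out_b + 16);
    }
    return 0;
}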
@ -0,0 +1,49 @@
/*
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1 : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1() : ConvolutionKernelBase("convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1") {}
    virtual ~Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& params, const optional_params& options) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
    bool NeedPaddedInput() const override { return true; }
    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
        return { WeightsLayout::os_is_yx_osv16_isv16 };
    }

    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
}  // namespace kernel_selector
@ -0,0 +1,146 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"
#include <vector>
#include <iostream>

//
// Kernel specific constants
//
#define SIMD_SIZE 16

namespace kernel_selector {

ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::F32);

    k.EnableInputWeightsType(WeightsType::INT8);

    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

KernelsData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetCommonKernelsData(params, options);
}

JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
    auto mem_consts = Parent::GetJitConstants(params, kd);

    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"",
                                             {"out_b", "out_f + get_sub_group_local_id()", "out_y", "out_x"},
                                             "dequantized",
                                             input_dt,
                                             1,
                                             LoadType::FEATURE_SHUFFLE};
        conf_scalar.SetLoopAxes({ Tensor::DataChannelName::BATCH }, true);
        conf_scalar.SetShuffleVarName("i");
        mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }

    return mem_consts;
}  // GetJitConstants

ConvolutionKernelBase::DispatchData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::SetDefault(const convolution_params& params, int) const {
    DispatchData kd;
    const auto& output = params.output;

    std::vector<size_t> global = {output.X().v, output.Y().v, output.Feature().v / 16 * output.Batch().v};
    std::vector<size_t> local = {1, 1, SIMD_SIZE};

    kd.gws0 = global[0];
    kd.gws1 = global[1];
    kd.gws2 = global[2];

    kd.lws0 = local[0];
    kd.lws1 = local[1];
    kd.lws2 = local[2];

    kd.cldnnStyle = {0, 0, 0, 0, 0};
    kd.gemmStyle = {0, 0, 0, 0, 0, 0};

    kd.efficiency = FORCE_PRIORITY_2;

    return kd;
}  // SetDefault

bool Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::Validate(const Params& params, const optional_params& options) const {
    if (!Parent::Validate(params, options)) {
        return false;
    }

    KernelData kd = KernelData::Default<convolution_params>(params);
    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());

    if ((newParams.filterSize.x != newParams.filterSize.y) ||
        newParams.filterSize.x != 3) {
        // Filter size must be 3x3
        return false;
    }

    if (newParams.stride.x != newParams.stride.y) {
        // Strides must be equal
        return false;
    }
    if (newParams.output.X().v != newParams.output.Y().v) {
        // W and H must be equal
        return false;
    }

    if (newParams.output.Feature().v % 16 != 0) {
        // Output feature size must be divisible by 16
        return false;
    }

    if (newParams.output.Batch().v % 16 != 0) {
        // Batch size must be divisible by 16
        return false;
    }

    // Check that all fused ops except eltwise have only feature or scalar inputs
    for (auto& fo : newParams.fused_ops) {
        if (fo.GetType() == FusedOpType::ELTWISE)
            continue;
        for (auto& input : fo.tensors) {
            if (input.X().v != 1 || input.Y().v != 1 || input.Batch().v != 1)
                return false;
        }
    }

    return true;
}
}  // namespace kernel_selector
@ -0,0 +1,49 @@
/*
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3 : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3() : ConvolutionKernelBase("convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_3x3") {}
    virtual ~Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& params, const optional_params& options) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
    bool NeedPaddedInput() const override { return true; }
    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
        return { WeightsLayout::os_is_yx_osv16_isv16 };
    }

    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
}  // namespace kernel_selector
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -73,6 +73,8 @@
#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"

namespace kernel_selector {
convolution_kernel_selector::convolution_kernel_selector() {
@ -93,6 +95,8 @@ convolution_kernel_selector::convolution_kernel_selector() {

    // bs_fs_yx_bsv16_fsv16
    Attach<ConvolutionKernel_bfyx_to_bfyx_bsv16_fsv16>();
    Attach<Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1>();
    Attach<Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3>();

    // fs_byx_fsv32
    Attach<ConvolutionKernel_fs_byx_fsv32>();
@ -0,0 +1,96 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"
#include "kernel_selector_utils.h"

//
// Kernel specific constants
//
#define SIMD_SIZE 16

namespace kernel_selector {
ParamsKey Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::F32);

    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnablePoolType(PoolType::MAX);
    k.EnablePoolType(PoolType::AVG);
    k.EnablePoolRemainder(PoolRemainder::FLOOR);
    k.EnablePoolRemainder(PoolRemainder::CEIL);
    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
    k.EnableDifferentTypes();
    return k;
}

PoolingKernelBase::DispatchData Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::SetDefault(const pooling_params& params) const {
    DispatchData runInfo = PoolingKernelBase::SetDefault(params);

    runInfo.gws0 = params.output.Feature().v / 16;
    runInfo.gws1 = params.output.X().v * params.output.Y().v;
    runInfo.gws2 = params.output.Batch().v;

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = SIMD_SIZE;
    runInfo.efficiency = FORCE_PRIORITY_1;

    return runInfo;
}

JitConstants Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetJitConstants(const pooling_params& params, DispatchData kd) const {
    auto jit = PoolingKernelBase::GetJitConstants(params, kd);

    if (!params.fused_ops.empty()) {
        auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
        FusedOpsConfiguration conf = {"", {"b", "f", "y", "x"}, "pool_result[i]", input_dt, 1};
        jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
    }

    return jit;
}

KernelsData Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
}

bool Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::Validate(const Params& params, const optional_params& options) const {
    if (!PoolingKernelBase::Validate(params, options)) {
        return false;
    }
    auto p = dynamic_cast<const pooling_params&>(params);

    if (p.inputs[0].Feature().v % 16 != 0)
        return false;

    if (p.inputs[0].Batch().v % 16 != 0)
        return false;

    return true;
}
}  // namespace kernel_selector
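A small sketch (illustrative sizes, not from the diff) of how this pooling dispatch decomposes: each work-item covers one 16-feature slice at one output (x, y), while the 16 subgroup lanes of dimension 2 cover one bsv16 batch slice, matching the b/f/y/x math in the OpenCL kernel later in this commit.

#include <cstdio>

int main() {
    const unsigned F = 32, X = 4, Y = 2;                   // assumed example tensor sizes
    const unsigned gws0 = F / 16, gws1 = X * Y;            // as in SetDefault(); gws2 = Batch
    for (unsigned g0 = 0; g0 < gws0; ++g0)
        for (unsigned g1 = 0; g1 < gws1; ++g1)
            std::printf("f-slice %u, y=%u, x=%u: 16 batches, one per subgroup lane\n",
                        g0, g1 / X, g1 % X);
    return 0;
}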
@ -0,0 +1,41 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "pooling_kernel_base.h"
#include <vector>

namespace kernel_selector {
class Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16 : public PoolingKernelBase {
public:
    Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16() : PoolingKernelBase("pooling_gpu_bs_fs_yx_bsv16_fsv16") {}
    virtual ~Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;
    bool Validate(const Params&, const optional_params&) const override;
    DispatchData SetDefault(const pooling_params& params) const override;
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }

protected:
    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
};
}  // namespace kernel_selector
@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -28,6 +28,7 @@
#include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
#include "pooling_kernel_gpu_bsv16_fsv16.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
#include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"

namespace kernel_selector {

@ -46,6 +47,7 @@ pooling_kernel_selector::pooling_kernel_selector() {
    Attach<PoolingKernel_b_fs_yx_fsv16>();
    Attach<PoolingKernel_bsv16_fsv16>();
    Attach<PoolingKernelGPU_b_fs_yx_fsv16_imad>();
    Attach<Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16>();
}

KernelsData pooling_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
@ -41,6 +41,7 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::yxfb);
    k.EnableOutputLayout(DataLayout::bfzyx);
@ -50,6 +51,7 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
@ -0,0 +1,144 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
#define ACTIVATION_TYPE float
#define TO_ACTIVATION_TYPE(x) convert_float(x)
#else
#define ACCUMULATOR_TYPE INPUT0_TYPE
#define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
#define ACTIVATION_TYPE INPUT0_TYPE
#define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
#endif

#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
#define OUTPUT_TYPE16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
    const __global INPUT0_TYPE *conv_input,
    __global OUTPUT_TYPE *output,
    const __global FILTER_TYPE *weights,
#if BIAS_TERM
    const __global BIAS_TYPE *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    const uint out_x = (uint)get_global_id(0);
    const uint out_y = (uint)get_global_id(1);
    const uint out_f = (uint)get_group_id(2) * 32 % OUTPUT_FEATURE_NUM;
    const uint out_b = ((uint)get_group_id(2) * 32 / OUTPUT_FEATURE_NUM) * 16 + get_sub_group_local_id();

    ACCUMULATOR_TYPE dotProd[32] = {0};
    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint weights_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);

    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + get_sub_group_local_id(), 0, 0, 0);
    uint filter_idx2 = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + 16 + get_sub_group_local_id(), 0, 0, 0);

    uint input_idx = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, out_b, 0, input_y, input_x);
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < INPUT0_FEATURE_NUM / 16; k++) {
        uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
        uint4 weights_val = vload4(0, (__global uint *)(weights + filter_idx));
        uint4 weights_val2 = vload4(0, (__global uint *)(weights + filter_idx2));

        __attribute__((opencl_unroll_hint(16)))
        for (uint j = 0; j < 16; j++) {
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));

            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val2.s0, j))));
            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val2.s1, j))));
            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val2.s2, j))));
            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val2.s3, j))));
        }
        filter_idx += weights_x_pitch;
        filter_idx2 += weights_x_pitch;
        input_idx += input_fs_pitch;
    }

    OUTPUT_TYPE16 results = 0;

    __attribute__((opencl_unroll_hint(2)))
    for (uint j = 0; j < 2; j++) {
        const uint dst_index = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, out_b, 16 * j + out_f, out_y, out_x);
#if BIAS_TERM
        ACTIVATION_TYPE bias = biases[out_f + 16 * j + get_sub_group_local_id()];
#endif
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
        FUSED_OPS_PRELOAD
#endif
        __attribute__((opencl_unroll_hint(16)))
        for (uint i = 0; i < 16; i++) {
            ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
            dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + intel_sub_group_shuffle(bias, i);
#else
            dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i];
#endif
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
            FUSED_OPS_CALC
#else
            FUSED_OPS
#endif
            OUTPUT_TYPE res = FUSED_OPS_RESULT;
            results[i] = res;
#else
            results[i] = TO_OUTPUT_TYPE(dequantized);
#endif
        }

#if OUTPUT_TYPE_SIZE == 1
        vstore4(as_uint4(results), 0, ((__global uint *)(output + dst_index)));
#else
        __attribute__((opencl_unroll_hint(16)))
        for (uint z = 0; z < 16; z++) {
            output[dst_index + z] = results[z];
        }
#endif
    }
}

#undef BLOCK_LOAD_INPUTS
#undef IN_BLOCK_WIDTH
#undef IN_BLOCK_HEIGHT
#undef PACK
#undef AS_TYPE_N_
#undef AS_TYPE_N
#undef AS_INPUT0_TYPE_4
#undef NUM_FILTERS
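A scalar C++ model (illustrative only, not the hardware intrinsic) of the IMAD step the loop above relies on: each 32-bit component of the uint4 loads packs four int8 values, and IMAD accumulates their 4-element dot product into an int32, so the four .s0..s3 lines together consume 16 input features per k iteration, with intel_sub_group_shuffle(weights_val.sN, j) broadcasting lane j's packed weights to the whole subgroup. The packing constant assumes a little-endian host.

#include <cstdint>
#include <cstring>

// acc + dot(a4, b4) over four packed int8 values, as one IMAD computes.
int32_t imad_model(int32_t acc, uint32_t packed_a, uint32_t packed_b) {
    int8_t a[4], b[4];
    std::memcpy(a, &packed_a, 4);  // the as_char4() reinterpretation
    std::memcpy(b, &packed_b, 4);
    for (int i = 0; i < 4; ++i)
        acc += int32_t(a[i]) * int32_t(b[i]);
    return acc;
}

int main() {
    // Activations {1,2,3,4} against weights {1,1,1,1} -> 10 (little-endian packing).
    return imad_model(0, 0x04030201u, 0x01010101u) == 10 ? 0 : 1;
}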
@ -0,0 +1,141 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
#define ACTIVATION_TYPE float
#define TO_ACTIVATION_TYPE(x) convert_float(x)
#else
#define ACCUMULATOR_TYPE INPUT0_TYPE
#define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
#define ACTIVATION_TYPE INPUT0_TYPE
#define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
#endif
#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
#define OUTPUT_TYPE16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)

// int8 conv_input and weights data is packed into int32 "batches";
// int/uint pointers are used here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_3x3)(
    const __global INPUT0_TYPE *conv_input,
    __global OUTPUT_TYPE *output,
    const __global FILTER_TYPE *weights,
#if BIAS_TERM
    const __global BIAS_TYPE *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    const uint out_x = (uint)get_global_id(0);
    const uint out_y = (uint)get_global_id(1);
    const uint out_f = (uint)get_group_id(2) * 16 % OUTPUT_FEATURE_NUM;
    const uint out_b = ((uint)get_group_id(2) * 16 / OUTPUT_FEATURE_NUM) * 16 + get_sub_group_local_id();

    ACCUMULATOR_TYPE dotProd[16] = {0};
    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint weights_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
    const uint y_diff = input_y_pitch - 3 * input_x_pitch;
    const uint f_diff = input_fs_pitch - 3 * input_y_pitch;
    uint input_idx = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, out_b, 0, input_y, input_x);

    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < INPUT0_FEATURE_NUM / 16; k++) {
        uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + get_sub_group_local_id(), k*16, 0, 0);

        __attribute__((opencl_unroll_hint(1)))
        for (uint y = 0; y < FILTER_SIZE_Y; y++) {
            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
            for (uint x = 0; x < FILTER_SIZE_X; x++) {
                uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
                uint4 weights_val = vload4(0, (__global uint *)(weights + filter_idx));

                __attribute__((opencl_unroll_hint(16)))
                for (uint j = 0; j < 16; j++) {
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
                }
                filter_idx += weights_x_pitch;
                input_idx += input_x_pitch;
            }
            input_idx += y_diff;
        }
        input_idx += f_diff;
    }

    OUTPUT_TYPE16 results = 0;
    const uint dst_index = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, out_b, out_f, out_y, out_x);
#if BIAS_TERM
    ACTIVATION_TYPE bias = biases[out_f + get_sub_group_local_id()];
#endif
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
    FUSED_OPS_PRELOAD
#endif
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; i++) {
        ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
        dequantized = (ACTIVATION_TYPE)dotProd[i] + intel_sub_group_shuffle(bias, i);
#else
        dequantized = (ACTIVATION_TYPE)dotProd[i];
#endif
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
        FUSED_OPS_CALC
#else
        FUSED_OPS
#endif
        OUTPUT_TYPE res = FUSED_OPS_RESULT;
        results[i] = res;
#else
        results[i] = TO_OUTPUT_TYPE(dequantized);
#endif
    }
#if OUTPUT_TYPE_SIZE == 1
    vstore4(as_uint4(results), 0, ((__global uint *)(output + dst_index)));
#else
    __attribute__((opencl_unroll_hint(16)))
    for (uint z = 0; z < 16; z++) {
        output[dst_index + z] = results[z];
    }
#endif
}

#undef BLOCK_LOAD_INPUTS
#undef IN_BLOCK_WIDTH
#undef IN_BLOCK_HEIGHT
#undef PACK
#undef AS_TYPE_N_
#undef AS_TYPE_N
#undef AS_INPUT0_TYPE_4
#undef NUM_FILTERS
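A sketch (with assumed pitch values, not the real JIT constants) of the incremental index walk the 3x3 kernel uses instead of recomputing a full 4-D index per tap: step right by x_pitch inside a row, add y_diff to drop to the start of the next row, and add f_diff to rewind to the window origin in the next 16-feature slice.

#include <cstdio>

int main() {
    const unsigned x_pitch = 16 * 16;        // one (bsv16, fsv16) cell
    const unsigned y_pitch = x_pitch * 10;   // assume a padded row of 10 cells
    const unsigned fs_pitch = y_pitch * 10;  // assume a padded plane of 10 rows
    const unsigned y_diff = y_pitch - 3 * x_pitch;
    const unsigned f_diff = fs_pitch - 3 * y_pitch;
    unsigned idx = 0;                        // window origin
    for (unsigned y = 0; y < 3; ++y) {
        for (unsigned x = 0; x < 3; ++x) {
            std::printf("tap(%u,%u) at %u\n", y, x, idx);
            idx += x_pitch;                  // next tap in the row
        }
        idx += y_diff;                       // start of the next row
    }
    idx += f_diff;                           // origin of the next feature slice
    std::printf("next slice origin at %u (== fs_pitch)\n", idx);
    return 0;
}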
@ -1,4 +1,4 @@
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -195,7 +195,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
        uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
        uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1 || OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16 == 1
        uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#else
#error "Incorrect output layout"
@ -0,0 +1,184 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/include_all.cl"
#include "include/data_types.cl"

#define ALIGN_TO(val, multiple) (((val) + (multiple)-1) / (multiple) * (multiple))

#define AS_TYPE(type, val) CAT(as_, type)(val)
#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define CONVERT_OUT CAT(convert_, OUTPUT_TYPE)
#define CONVERT_OUT_VEC16 CAT(convert_, OUT_VEC16)
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16
#if MAX_POOLING
#define INIT_VAL -128
#elif AVG_POOLING
#define INIT_VAL 0
#else
#error
#endif

inline int FUNC(apply_pooling)(int tmp, int in) {
#if MAX_POOLING
    return max(tmp, in);
#elif AVG_POOLING
    return tmp + in;
#endif
}

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(pooling_gpu_bs_fs_yx_bsv16_fsv16)(const __global INPUT0_TYPE* input,
                                         __global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
                                         , FUSED_OPS_DECLS
#endif
) {
    const uint f = (uint)get_global_id(0) * 16;
    const uint y = (uint)get_global_id(1) / OUTPUT_SIZE_X;
    const uint x = (uint)get_global_id(1) % OUTPUT_SIZE_X;
    const uint b = (uint)get_group_id(2) * 16 + get_sub_group_local_id();

    const int offset_x = (int)x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int offset_y = (int)y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint input_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
    int16 result = INIT_VAL;

#ifdef CHECK_BOUNDRY
    uint batch_and_feature_offset = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, 0, 0);
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || offset_y + POOL_SIZE_Y < 0 ||
        offset_y >= INPUT0_SIZE_Y) {
        return;
    }

#ifdef DYNAMIC_KERNEL_DIVIDER
    uint num_elements = 0;
#endif
    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
    for (uint j = 0; j < POOL_SIZE_Y; j++) {
        int input_offset_y = offset_y + j;
        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
        if (!zero_y) {
            __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
            for (uint i = 0; i < POOL_SIZE_X; i++) {
                int input_offset_x = offset_x + i;
                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
                if (!zero) {
                    const uint input_idx =
                        batch_and_feature_offset + input_offset_y * input_y_pitch + input_offset_x * input_x_pitch;
                    int4 int_data = vload4(0, (__global int*)(input + input_idx));
                    IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
                    __attribute__((opencl_unroll_hint(16)))
                    for (uint z = 0; z < 16; z++)
                        result[z] = FUNC_CALL(apply_pooling)(result[z], (int)ch16_data[z]);
#ifdef DYNAMIC_KERNEL_DIVIDER
                    num_elements++;
#endif
                }
            }
        }
    }

#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
    const uint num_elements = (hend - offset_y) * (wend - offset_x);
#endif
#else  // !CHECK_BOUNDRY
    uint input_idx = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, offset_y, offset_x);
    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
    for (uint j = 0; j < POOL_SIZE_Y; j++) {
        __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
        for (uint i = 0; i < POOL_SIZE_X; i++) {
            int4 int_data = vload4(0, (__global int*)(input + input_idx));
            IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
            __attribute__((opencl_unroll_hint(16)))
            for (uint z = 0; z < 16; z++)
                result[z] = FUNC_CALL(apply_pooling)(result[z], (int)ch16_data[z]);

            input_idx += input_x_pitch;
        }
        input_idx += (input_y_pitch - POOL_SIZE_X * input_x_pitch);
    }

#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
    const uint num_elements = POOL_SIZE_X * POOL_SIZE_Y;
#endif
#endif
#if defined AVG_POOLING
#if ENABLE_ROUND
    int16 pool_result;
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
        pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
#else
        pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
#endif
    }
#else
    float16 pool_result;
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
        pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
#else
        pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
#endif
    }
#endif  // ENABLE_ROUND
#else  // AVG_POOLING
    int16 pool_result;
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; ++i) {
        pool_result[i] = result[i];
    }
#endif  // AVG_POOLING
    OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
    FUSED_OPS_PRELOAD
#endif
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; ++i) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
        FUSED_OPS_CALC
#else
        FUSED_OPS
#endif
        final_result[i] = FUSED_OPS_RESULT;
#else
        final_result[i] = pool_result[i];
#endif
    }
    const uint output_pos = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, b, f, y, x);

#if OUTPUT_TYPE_SIZE == 1
    vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
#else
    *((__global OUT_VEC16*)(output + output_pos)) = final_result;
#endif
}

#undef ALIGN_TO
#undef AS_TYPE
#undef IN_VEC16
#undef OUT_VEC16
#undef CONVERT_OUT
#undef CONVERT_OUT_VEC16
#undef INIT_VAL
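A hedged C++ sketch of the divider choice the AVG branch above makes per lane (the mode names mirror the JIT defines; the sum and element count are made-up example values): FIXED always divides by the full window area, while the DYNAMIC variants divide by the number of taps that actually landed inside the input.

#include <algorithm>
#include <cstdio>

int main() {
    const int pool_x = 3, pool_y = 3;
    int sum = 42;            // accumulated window sum for one lane/feature
    unsigned in_window = 6;  // taps counted inside the input bounds

    float fixed_avg   = float(sum) / float(pool_x * pool_y);          // KernelDividerMode::FIXED
    float dynamic_avg = float(sum) / float(std::max(in_window, 1u));  // DYNAMIC / DYNAMIC_WITH_PADDING
    std::printf("fixed=%f dynamic=%f\n", fixed_avg, dynamic_avg);
    return 0;
}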
@ -1093,6 +1093,7 @@ JitConstants FusedOpsCodeGenerator::MakeLoadJitConstants(const FusedOpsConfigura

    auto vec_size = conf.vec_size;
    auto idx = conf.bfzyx_idx_order;
    auto fused_op_config = conf;

    std::string load_decls = "";
    static int i = 0;
@ -1103,11 +1104,19 @@ JitConstants FusedOpsCodeGenerator::MakeLoadJitConstants(const FusedOpsConfigura
    if (reuse_index) {
        load_decls += "\\\n\tint " + reused_idx + " = " + GetIdx(0, idx_desc{idx, desc.tensors[0]}, safe_load) + ";";
    }
    // TODO: add a generic way to support a shuffled feature, e.g. the possibility to add a separate config for each fused op
    if (desc.GetType() == KernelType::ELTWISE && conf.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE) {
        std::string sub_group_local_id_str = "get_sub_group_local_id()";
        size_t found_sub = conf.bfzyx_idx_order[1].rfind(sub_group_local_id_str);
        if (found_sub != std::string::npos)
            fused_op_config.bfzyx_idx_order[1].replace(found_sub, sub_group_local_id_str.length(), fused_op_config.shuffle_var_name);
    }

    for (auto op_input_id : GetRequiredInputs()) {
        load_decls += "\\\n\t" + GetInputTypeName(op_input_id, vec_size) + " " + GetInputVarName(op_input_id) + " = " +
                      GetJitLoad(conf, op_input_id, prim_output, reuse_index, reused_idx) + ";";
                      GetJitLoad(fused_op_config, op_input_id, prim_output, reuse_index, reused_idx) + ";";
    }

    jit.AddConstant(MakeJitConstant("FUSED_OP" + std::to_string(desc.op_id) + "_LOAD" + conf.suffix, load_decls));

    return jit;
@ -1121,13 +1130,20 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
    std::string op_decls = "";
    auto vec_size = conf.vec_size;
    auto idx = conf.bfzyx_idx_order;
    std::string shuffle_var = conf.shuffle_var_name;
    bool is_shuffled = false;

    out_var = GetOutputVarName(in_var);
    out_type = desc.output_tensor.GetDType();

    if (conf.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE &&
        (desc.GetType() == KernelType::SCALE || desc.GetType() == KernelType::QUANTIZE)) {
        is_shuffled = true;
    }

    std::vector<std::string> in_vars_converted;
    for (size_t i = 0; i < desc.tensors.size(); i++) {
        auto in_name = GetInputVarName(i);
        auto in_name = GetInputVarName(i, is_shuffled, shuffle_var);
        if (desc.tensors[0].GetDType() != desc.output_tensor.GetDType()) {
            in_name = ConvertToOutputType(in_name, vec_size);
        }
@ -1163,17 +1179,17 @@
    }

    auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
                                                 : ConvertToType(GetInputVarName(p->out_scale_idx), tmp_type, vec_size);
                                                 : ConvertToType(GetInputVarName(p->out_scale_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), tmp_type, vec_size)
                                                 : ConvertToType(GetInputVarName(p->out_shift_idx), tmp_type, vec_size);
                                                 : ConvertToType(GetInputVarName(p->out_shift_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), tmp_type, vec_size)
                                               : ConvertToType(GetInputVarName(p->in_scale_idx), tmp_type, vec_size);
                                               : ConvertToType(GetInputVarName(p->in_scale_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), tmp_type, vec_size)
                                               : ConvertToType(GetInputVarName(p->in_shift_idx), tmp_type, vec_size);
                                               : ConvertToType(GetInputVarName(p->in_shift_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), tmp_type, vec_size)
                                           : ConvertToType(GetInputVarName(p->in_range_lo_idx), tmp_type, vec_size);
                                           : ConvertToType(GetInputVarName(p->in_range_lo_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), tmp_type, vec_size)
                                           : ConvertToType(GetInputVarName(p->in_range_hi_idx), tmp_type, vec_size);
                                           : ConvertToType(GetInputVarName(p->in_range_hi_idx, is_shuffled, shuffle_var), tmp_type, vec_size);

    if (p->has_clamp) {
        op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
@ -1353,7 +1369,10 @@ std::string FusedOpsCodeGenerator::GetInputPtrName(size_t input_id) const {
    return GetTypeStr() + std::to_string(desc.op_id) + "_input" + std::to_string(input_id);
}

std::string FusedOpsCodeGenerator::GetInputVarName(size_t input_id) const {
std::string FusedOpsCodeGenerator::GetInputVarName(size_t input_id, bool is_shuffled, std::string shuffle_var) const {
    if (is_shuffled)
        return "intel_sub_group_shuffle(" + GetTypeStr() + std::to_string(desc.op_id) + "_data" +
               std::to_string(input_id) + ", " + shuffle_var + ")";
    return GetTypeStr() + std::to_string(desc.op_id) + "_data" + std::to_string(input_id);
}

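A standalone sketch of the string GetInputVarName() now builds when is_shuffled is set; the "scale" prefix and the op/input ids are assumed example values (the real prefix comes from GetTypeStr()), and "i" matches the SetShuffleVarName("i") calls in the new convolution kernels.

#include <iostream>
#include <string>

std::string input_var_name(const std::string& type_str, int op_id,
                           int input_id, bool is_shuffled,
                           const std::string& shuffle_var) {
    std::string base = type_str + std::to_string(op_id) + "_data" + std::to_string(input_id);
    if (is_shuffled)  // read another lane's loaded value instead of this lane's
        return "intel_sub_group_shuffle(" + base + ", " + shuffle_var + ")";
    return base;
}

int main() {
    std::cout << input_var_name("scale", 0, 0, true, "i") << "\n";
    // -> intel_sub_group_shuffle(scale0_data0, i)
    return 0;
}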
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2018 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -353,7 +353,7 @@ public:
                   bool reuse_index = false, std::string reused_idx = "") const;
    std::string GetIdx(size_t input_id, idx_desc idx, bool should_be_safe) const;
    std::string GetInputPtrName(size_t input_id) const;
    std::string GetInputVarName(size_t input_id) const;
    std::string GetInputVarName(size_t input_id, bool is_shuffled = false, std::string shuffle_var = "") const;
    std::string GetOutputVarName(std::string input_var_name) const;
    std::string ConvertToOutputType(std::string var, size_t vec_size = 1) const;
    std::string ConvertToType(std::string var, Datatype dt, size_t vec_size = 1) const;
@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -99,12 +99,15 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa

        bool can_use_preload = fused_dep_codegen.CanPreloadData(c);
        can_all_use_preload &= can_use_preload;

        bool can_preload_eltwise = true;
        if (params.fused_ops[i].GetType() == FusedOpType::ELTWISE &&
            c.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE)
            can_preload_eltwise = false;
        fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
        fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
        if (can_use_preload)
        if (can_use_preload && can_preload_eltwise)
            fused_ops_preload += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
        if (c.allow_for_partial_preload && !can_use_preload)
        if (c.allow_for_partial_preload && (!can_use_preload || !can_preload_eltwise))
            fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
        fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
    }
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -440,7 +440,8 @@ struct base_activation_params {
struct FusedOpsConfiguration {
    enum class LoadType {
        LT_UNALIGNED = 0,
        LT_ALIGNED_READ = 1
        LT_ALIGNED_READ = 1,
        FEATURE_SHUFFLE = 2
    };

    enum class BoundaryCheck {
@ -476,6 +477,8 @@ struct FusedOpsConfiguration {
    // If allow_for_partial_preload is false, then it's required that all fused_ops can be preloaded.
    // If allow_for_partial_preload is true, then not preloaded fused_ops will be loaded in FUSED_OPS_CALC.
    bool allow_for_partial_preload;
    // Load index for shuffle fused op
    std::string shuffle_var_name;

    FusedOpsConfiguration(std::string suffix,
                          std::vector<std::string> bfzyx_idx_order,
@ -487,7 +490,8 @@ struct FusedOpsConfiguration {
                          IndexType index_type = IndexType::TENSOR_COORD,
                          Tensor::DataChannelName vec_axis = Tensor::DataChannelName::COUNT,
                          std::vector<Tensor::DataChannelName> loop_axes = {},
                          bool allow_for_partial_preload = false)
                          bool allow_for_partial_preload = false,
                          std::string shuffle_var_name = "")
        : suffix(suffix)
        , bfzyx_idx_order(bfzyx_idx_order)
        , input_var_name(input_var_name)
@ -498,7 +502,8 @@ struct FusedOpsConfiguration {
        , boundary_check(boundary_check)
        , index_type(index_type)
        , loop_axes(loop_axes)
        , allow_for_partial_preload(allow_for_partial_preload) { }
        , allow_for_partial_preload(allow_for_partial_preload)
        , shuffle_var_name(shuffle_var_name) { }

    FusedOpsConfiguration& SetVectorSize(size_t val) { vec_size = val; return *this; }
    FusedOpsConfiguration& SetLoadType(LoadType val) { load_type = val; return *this; }
@ -509,6 +514,7 @@ struct FusedOpsConfiguration {
        loop_axes = std::move(val);
        allow_for_partial_preload = partial_preload;
        return *this; }
    FusedOpsConfiguration& SetShuffleVarName(std::string val) { shuffle_var_name = val; return *this; }
};

// Instance of fused_operation_desc is added to fused_ops vector if a node has been fused to current one using program_impl::fuse_nodes
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -225,6 +225,8 @@ attach_convolution_gpu::attach_convolution_gpu() {
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw);
}

}  // namespace detail
@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -110,6 +110,9 @@ attach_fully_connected_gpu::attach_fully_connected_gpu() {
    {std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw},
    // fs_b_yx_fsv32
    {std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw},
});

@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -199,6 +199,8 @@ attach_pooling_gpu::attach_pooling_gpu() {
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
// MMAD
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create);

@ -171,6 +171,10 @@ attach_quantize_gpu::attach_quantize_gpu() {

implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);

implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw);
}

} // namespace detail

@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -123,6 +123,7 @@ void prepare_padding::run(program_impl& p) {
if (conv_layout.format != cldnn::format::bfyx &&
conv_layout.format != cldnn::format::bf8_xy16 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
conv_layout.format != cldnn::format::byxf_af32 &&
conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv4 &&

@ -321,8 +321,10 @@ void remove_redundant_reorders::run(program_impl& p) {

auto& usr = node->get_users().front();
auto& dep = node->get_dependency(0);
if (!usr->is_type<quantize>() || node->get_output_layout().format != format::bfyx ||
(dep.get_output_layout().format != format::b_fs_yx_fsv16 && dep.get_output_layout().format != format::fs_b_yx_fsv32))
if (!usr->is_type<quantize>() ||
(dep.get_output_layout().format != format::b_fs_yx_fsv16 &&
dep.get_output_layout().format != format::fs_b_yx_fsv32 &&
dep.get_output_layout().format != format::bfyx))
continue;

dep.merge_output_padding(node->get_output_layout().data_padding);

@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -157,6 +157,7 @@ private:
const layout &weights_layout,
std::shared_ptr<const convolution> conv);
bool convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_layout,
const layout& weights_layout,
std::shared_ptr<const convolution> conv);
bool convolution_fs_b_yx_fsv32_opt(const layout& input_layout,
const layout& weights_layout,

@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -206,6 +206,8 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
return cldnn::format::bfwzyx;
case kernel_selector::data_layout::b_fs_yx_fsv4:
return cldnn::format::b_fs_yx_fsv4;
case kernel_selector::data_layout::bs_fs_yx_bsv16_fsv16:
return cldnn::format::bs_fs_yx_bsv16_fsv16;
case kernel_selector::data_layout::nv12:
return cldnn::format::nv12;
case kernel_selector::data_layout::image_2d_rgba:
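
Editor's note: for readers unfamiliar with the layout name, a simplified sketch of the flat offset bs_fs_yx_bsv16_fsv16 implies, assuming no padding and batch/feature counts already aligned to 16. The exact in-memory order is defined by cldnn's format traits; this helper is illustrative only, not part of the source tree:

// b/f/y/x are logical coordinates; F/Y/X are the aligned feature and
// spatial sizes. Blocks of 16 batches are outermost; at each (y, x)
// location a 16-batch x 16-feature tile is contiguous, feature innermost.
size_t offset_bs_fs_yx_bsv16_fsv16(size_t b, size_t f, size_t y, size_t x,
                                   size_t F, size_t Y, size_t X) {
    const size_t bsv = 16, fsv = 16;
    const size_t f_blocks = F / fsv;
    return ((((b / bsv) * f_blocks + f / fsv) * Y + y) * X + x) * (bsv * fsv)
           + (b % bsv) * fsv + (f % fsv);
}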

@ -211,7 +211,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
(fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
(prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3)))
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 &&
(next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8))))
return true;

if (next.is_type<convolution>() &&
@ -221,12 +223,14 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
(prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))))
return true;

if (next.is_type<quantize>() && fmt_prev == format::bfyx && fmt_next == format::b_fs_yx_fsv16)
if (next.is_type<quantize>() && fmt_prev == format::bfyx && (fmt_next == format::b_fs_yx_fsv16 ||
fmt_next == format::bs_fs_yx_bsv16_fsv16 || fmt_next == format::b_fs_yx_fsv4))
return true;

if (next.is_type<convolution>() &&
fmt_prev == format::bfyx && prev_output_layout.size.feature[0] == 3 &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32))
(fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bfyx) && prev_output_layout.size.feature[0] == 3 &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32 ||
fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;

if (next.is_type<convolution>() &&
@ -246,7 +250,8 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
return true;

if (prev.is_type<quantize>() &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 || fmt_next == format::b_fs_yx_fsv16))
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 ||
fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;

return false;
@ -450,14 +455,24 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
}

bool layout_optimizer::convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_layout,
const layout& weights_layout,
std::shared_ptr<const convolution> conv) {
// A set of rules that define when bs_fs_yx_bsv16_fsv16 mem format can be used
bool correct_batch = input_layout.size.batch[0] > 16;
bool correct_batch = input_layout.size.batch[0] >= 16;
bool correct_feature = (input_layout.size.feature[0] % 16 == 0 || input_layout.size.feature[0] == 3) && conv->output_size.feature[0] % 16 == 0;
bool fp16_ver = input_layout.data_type == data_types::f16 && input_layout.size.batch[0] % 32 == 0;
bool fp32_ver = input_layout.data_type == data_types::f32 && input_layout.size.batch[0] % 16 == 0;
bool single_group = conv->groups == 1;
return (fp16_ver || fp32_ver) && correct_feature && correct_batch && single_group;
bool int8_sup = (input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
input_layout.size.batch[0] % 16 == 0 && weights_layout.data_type == data_types::i8 &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty());
auto ks_x = weights_layout.size.spatial[0];
auto ks_y = weights_layout.size.spatial[1];
int8_sup &= (input_layout.size.spatial[2] == 1 && ((ks_x == 1 && ks_y == 1) || (ks_x == 3 && ks_y == 3) || (ks_x == 7 && ks_y == 7)) &&
input_layout.size.batch[0] % 16 == 0 && weights_layout.size.batch[0] % 32 == 0 && conv->groups == 1 &&
conv->split() == 1 && conv->dilation == tensor{1});

return (int8_sup || fp16_ver || fp32_ver) && correct_feature && correct_batch && single_group;
}
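
Editor's note: condensed, the new int8 gate accepts a 2D convolution only when the batch is a block-aligned multiple of 16, weights are s8 with no zero points, the kernel is square 1x1/3x3/7x7, and output features are 32-aligned (reading weights batch as the output feature count). A standalone paraphrase over plain dimensions, dropping the 3D, group, split, and dilation checks for brevity; this hypothetical helper is not part of the tree:

bool int8_bsv16_fsv16_eligible(int batch, int in_f, int out_f,
                               int ks_x, int ks_y, bool has_zero_points) {
    const bool kernel_ok = (ks_x == 1 && ks_y == 1) ||
                           (ks_x == 3 && ks_y == 3) ||
                           (ks_x == 7 && ks_y == 7);
    // Mirrors correct_batch/correct_feature plus the int8_sup clauses above.
    return batch >= 16 && batch % 16 == 0 && !has_zero_points && kernel_ok &&
           out_f % 32 == 0 && (in_f % 16 == 0 || in_f == 3);
}

For example, int8_bsv16_fsv16_eligible(16, 32, 64, 3, 3, false) holds, while batch 8 or a 5x5 kernel would fall back to another int8 path.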

bool layout_optimizer::convolution_fs_b_yx_fsv32_opt(layout const& input_layout,
@ -630,7 +645,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
const float cond_denom = _total_conv > 0 ? 1.0f / static_cast<float>(_total_conv) : 1.0f;

if ((input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8)) {
if ((_optimization_attributes.b_fs_yx_fsv16_network &&
if ((_optimization_attributes.bs_fs_yx_bsv16_fsv16_network && expected_tensor.batch[0] % 16 == 0 &&
convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
} else if ((_optimization_attributes.b_fs_yx_fsv16_network &&
convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
expected_format = cldnn::format::b_fs_yx_fsv16;
} else {
@ -651,7 +669,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
expected_tensor = current_layout.size;
expected_format = cldnn::format::bfzyx;
} else if (_optimization_attributes.bs_fs_yx_bsv16_fsv16_network &&
convolution_bs_fs_yx_bsv16_fsv16_opt(node.input().get_output_layout(), prim)) {
convolution_bs_fs_yx_bsv16_fsv16_opt(node.input().get_output_layout(), output_or_weights_layout, prim)) {
expected_tensor = current_layout.size;
expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
} else if (_optimization_attributes.fs_b_yx_fsv32_network && !node.get_transposed() &&
@ -781,6 +799,11 @@ format layout_optimizer::get_preferred_format(program_node& node) {
output_layout,
node.as<detection_output>(),
layout{ data_types::f32, format::bfyx, tensor{} }).format;
} else if (node.is_type<quantize>()) {
auto layout = node.get_output_layout();
if ((layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
layout.size.batch[0] % 16 == 0)
expected = format::b_fs_yx_fsv4;
} else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
expected = node.get_output_layout().format;
} else if (node.is_type<reshape>()) {
@ -850,7 +873,7 @@ bool layout_optimizer::is_format_optimized(const convolution_node& node, const f
case format::fs_b_yx_fsv32:
return convolution_fs_b_yx_fsv32_opt(input_layout, weights_layout, prim);
case format::bs_fs_yx_bsv16_fsv16:
return convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, prim);
return convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, weights_layout, prim);
default:
throw std::invalid_argument(
"[Layout optimizer] Other formats in is_format_optimized(...) method are not implemented!");

@ -1186,7 +1186,8 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::scale::type_id() &&
prim.type() != cldnn::softmax::type_id() &&
prim.type() != cldnn::fully_connected::type_id() &&
prim.type() != cldnn::generic_layer::type_id())
prim.type() != cldnn::generic_layer::type_id() &&
prim.type() != cldnn::quantize::type_id())
can_use_bs_fs_yx_bsv16_fsv16 = false;
}
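
Editor's note: with quantize added to this allow-list, a network whose non-convolution primitives include quantize nodes no longer has can_use_bs_fs_yx_bsv16_fsv16 forced to false, which is what permits the batched int8 layout to be selected network-wide by the layout optimizer changes above.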

@ -8376,6 +8376,25 @@ struct params_generator : std::vector<convolution_random_test_all_params> {
return *this;
}

params_generator& bs_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
std::vector<int> strides = { 1, 2 };
for (auto s : strides) {
// 1x1
push_back(convolution_random_test_all_params{
// feature input filter stride offset dilation bias groups
//batch in out x y x y x y x y x y
16, 32, 32, { 4, 4 }, { 1, 1 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
16, 32, 32, { 9, 9 }, { 1, 1 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 3x3
push_back(convolution_random_test_all_params{
16, 32, 32, { 4, 4 }, { 3, 3 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
16, 32, 32, { 9, 9 }, { 3, 3 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
}
return *this;
}

params_generator& all_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
return smoke_test_params(input_format, asymm_weights, asymm_data, padded_input)
.extra_test_params(input_format, asymm_weights, asymm_data, padded_input);
@ -8421,6 +8440,7 @@ INSTANTIATE_TEST_CASE_P(
.smoke_test_params(format::b_fs_yx_fsv32, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16)
.bs_test_params(format::bs_fs_yx_bsv16_fsv16)
),
to_string_convolution_all_params
);
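
Editor's note: the new generator method can also be used on its own; a hypothetical standalone call, relying only on bs_test_params returning *this as defined above:

// Accumulate batch-16 cases in the blocked layout, then the same shapes
// with asymmetric quantization enabled.
params_generator gen;
gen.bs_test_params(format::bs_fs_yx_bsv16_fsv16)              // symmetric weights and data
   .bs_test_params(format::bs_fs_yx_bsv16_fsv16, true)        // asymmetric weights
   .bs_test_params(format::bs_fs_yx_bsv16_fsv16, true, true); // asymmetric weights and data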

@ -404,6 +404,8 @@ public:
#define CASE_CONV_U8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_8 {1, 3, 4, 5}, {1, 32, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_9 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_10 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx

#define CASE_CONV_S8S8_1 {1, 15, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_2 {1, 15, 5, 5}, {1, 30, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
@ -413,6 +415,8 @@ public:
#define CASE_CONV_S8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_8 {1, 3, 4, 5}, {1, 32, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_9 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_10 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx

#define CASE_CONV3D_U8S8_1 {1, 15, 5, 4, 5}, {1, 30, 3, 2, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_CONV3D_U8S8_2 {1, 15, 5, 5, 5}, {1, 30, 3, 3, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
@ -1536,10 +1540,14 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_quantize_i8,
bc_test_params{CASE_CONV_U8S8_2, 2, 4},
bc_test_params{CASE_CONV_U8S8_3, 2, 4},
bc_test_params{CASE_CONV_U8S8_4, 2, 4},
bc_test_params{CASE_CONV_U8S8_9, 2, 4},
bc_test_params{CASE_CONV_U8S8_10, 2, 4},
bc_test_params{CASE_CONV_S8S8_1, 2, 4},
bc_test_params{CASE_CONV_S8S8_2, 2, 4},
bc_test_params{CASE_CONV_S8S8_3, 2, 4},
bc_test_params{CASE_CONV_S8S8_4, 2, 4},
bc_test_params{CASE_CONV_S8S8_9, 2, 4},
bc_test_params{CASE_CONV_S8S8_10, 2, 4},

bc_test_params{CASE_CONV3D_U8S8_1, 2, 4},
bc_test_params{CASE_CONV3D_U8S8_2, 2, 4},

@ -2599,6 +2599,20 @@ INSTANTIATE_TEST_CASE_P(
format::b_fs_yx_fsv32)),
testing::internal::DefaultParamName<pooling_random_test_params>);

INSTANTIATE_TEST_CASE_P(
batched_low_precision,
pooling_random_test,
testing::Combine(
testing::Values(16),
testing::Values(16, 32),
testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
testing::Values(std::tuple<int, int>(1, 1)),
testing::Values(std::tuple<int, int>(0, 0)),
testing::Values(format::bs_fs_yx_bsv16_fsv16)
),
testing::internal::DefaultParamName<pooling_random_test_params>);
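
Editor's note: reading the Combine axes in order as batch, feature count, input size, pooling window, stride, and offset (an inference from the values, not confirmed by this hunk), the new batched_low_precision instantiation sweeps batch 16 with 16 or 32 features, two input sizes and two window shapes, all in the bs_fs_yx_bsv16_fsv16 layout.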

template <typename InputT, pooling_mode Mode>
class pooling_scale_random_test_base : public pooling_random_test_base<InputT, Mode> {
public: