From c9d4e6b934c71258cb976ada959907c3cee6da73 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 30 Jun 2020 22:18:24 +0300 Subject: [PATCH] [IE CLDNN] Removed unused primitives and related structures (#1039) --- .../src/cldnn_engine/cldnn_program.cpp | 39 - inference-engine/src/cldnn_engine/dllmain.cpp | 22 - .../thirdparty/clDNN/api/activation.hpp | 7 - .../thirdparty/clDNN/api/activation_grad.hpp | 96 - .../thirdparty/clDNN/api/apply_adam.hpp | 111 - .../thirdparty/clDNN/api/batch_norm.hpp | 184 -- .../thirdparty/clDNN/api/batch_norm_grad.hpp | 61 - .../thirdparty/clDNN/api/contract.hpp | 95 - .../clDNN/api/convolution_grad_input.hpp | 95 - .../clDNN/api/convolution_grad_weights.hpp | 217 -- .../thirdparty/clDNN/api/deconvolution.hpp | 34 +- .../thirdparty/clDNN/api/eltwise.hpp | 49 +- .../thirdparty/clDNN/api/embed.hpp | 79 - .../clDNN/api/fully_connected_grad_input.hpp | 59 - .../api/fully_connected_grad_weights.hpp | 115 - .../thirdparty/clDNN/api/index_select.hpp | 109 - .../thirdparty/clDNN/api/lookup_table.hpp | 58 - .../thirdparty/clDNN/api/network.hpp | 6 - .../thirdparty/clDNN/api/scale_grad_input.hpp | 51 - .../clDNN/api/scale_grad_weights.hpp | 131 - .../clDNN/api/softmax_loss_grad.hpp | 47 - .../api_extension/fused_conv_bn_scale.hpp | 115 - .../api_extension/fused_conv_eltwise.hpp | 67 +- .../kernel_selector/common/common_tools.h | 1 - .../kernel_selector/common/common_types.h | 14 - .../activation/activation_kernel_base.cpp | 3 - .../activation/activation_kernel_opt.cpp | 1 - .../activation/activation_kernel_ref.cpp | 1 - .../batch_norm/batch_norm_kernel_base.cpp | 88 - .../batch_norm/batch_norm_kernel_base.h | 66 - .../batch_norm/batch_norm_kernel_ref.cpp | 41 - .../batch_norm/batch_norm_kernel_ref.h | 30 - .../batch_norm/batch_norm_kernel_selector.cpp | 29 - .../batch_norm/batch_norm_kernel_selector.h | 35 - .../batch_norm_grad_kernel_base.cpp | 72 - .../batch_norm_grad_kernel_base.h | 57 - .../batch_norm_grad_kernel_ref.cpp | 41 - .../batch_norm_grad_kernel_ref.h | 30 - .../batch_norm_grad_kernel_selector.cpp | 29 - .../batch_norm_grad_kernel_selector.h | 35 - .../contract/contract_kernel_base.cpp | 111 - .../contract/contract_kernel_base.h | 52 - .../contract/contract_kernel_ref.cpp | 49 - .../contract/contract_kernel_ref.h | 27 - .../contract/contract_kernel_selector.cpp | 24 - .../contract/contract_kernel_selector.h | 31 - ...on_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp | 87 - ...tion_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h | 41 - ...lution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp | 61 - ...volution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h | 37 - .../convolution_kernel_mmad_1x1_gemm.cpp | 108 - .../convolution_kernel_mmad_1x1_gemm.h | 40 - ...kernel_mmad_32x32sg_128x128wg_slm_int8.cpp | 180 -- ...n_kernel_mmad_32x32sg_128x128wg_slm_int8.h | 42 - ...kernel_mmad_32x32sg_224x128wg_slm_int8.cpp | 180 -- ...n_kernel_mmad_32x32sg_224x128wg_slm_int8.h | 42 - ...nvolution_kernel_mmad_32x32sg_slm_int8.cpp | 176 -- ...convolution_kernel_mmad_32x32sg_slm_int8.h | 41 - .../convolution_kernel_selector.cpp | 24 - .../convolution_grad_weights_kernel_1x1.cpp | 67 - .../convolution_grad_weights_kernel_1x1.h | 32 - .../convolution_grad_weights_kernel_3x3.cpp | 72 - .../convolution_grad_weights_kernel_3x3.h | 32 - .../convolution_grad_weights_kernel_7x7.cpp | 70 - .../convolution_grad_weights_kernel_7x7.h | 32 - .../convolution_grad_weights_kernel_base.cpp | 135 - .../convolution_grad_weights_kernel_base.h | 79 - .../convolution_grad_weights_kernel_ref.cpp | 45 - 
.../convolution_grad_weights_kernel_ref.h | 29 - ...nvolution_grad_weights_kernel_selector.cpp | 36 - ...convolution_grad_weights_kernel_selector.h | 34 - .../convolution_grad_weights_kernel_yxfb.cpp | 74 - .../convolution_grad_weights_kernel_yxfb.h | 32 - .../deconvolution_kernel_bfyx_opt.cpp | 1 - .../deconvolution_kernel_ref.cpp | 1 - .../eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp | 4 +- .../eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp | 288 -- .../eltwise/eltwise_kernel_b_fs_yx_fsv4.h | 36 - .../eltwise/eltwise_kernel_base.cpp | 15 +- .../eltwise/eltwise_kernel_base.h | 6 - .../eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp | 301 -- .../eltwise_kernel_fs_bs_yx_bsv4_fsv32.h | 32 - .../eltwise/eltwise_kernel_ref.cpp | 1 - .../eltwise/eltwise_kernel_selector.cpp | 4 - .../actual_kernels/embed/embed_kernel_ref.cpp | 108 - .../actual_kernels/embed/embed_kernel_ref.h | 42 - .../embed/embed_kernel_selector.cpp | 27 - .../embed/embed_kernel_selector.h | 35 - .../core/actual_kernels/embed/embed_params.h | 51 - .../fully_connected_kernel_base.cpp | 2 - .../fully_connected_kernel_mmad_batched.cpp | 122 - .../fully_connected_kernel_mmad_batched.h | 36 - .../fully_connected_kernel_selector.cpp | 2 - ...fully_connected_grad_input_kernel_base.cpp | 82 - .../fully_connected_grad_input_kernel_base.h | 54 - .../fully_connected_grad_input_kernel_ref.cpp | 44 - .../fully_connected_grad_input_kernel_ref.h | 29 - ...y_connected_grad_input_kernel_selector.cpp | 28 - ...lly_connected_grad_input_kernel_selector.h | 34 - ...lly_connected_grad_weights_kernel_base.cpp | 93 - ...fully_connected_grad_weights_kernel_base.h | 58 - ...ully_connected_grad_weights_kernel_ref.cpp | 43 - .../fully_connected_grad_weights_kernel_ref.h | 30 - ...connected_grad_weights_kernel_selector.cpp | 28 - ...y_connected_grad_weights_kernel_selector.h | 34 - .../fused_conv_bn_scale_kernel_base.cpp | 158 - .../fused_conv_bn_scale_kernel_base.h | 77 - .../fused_conv_bn_scale_kernel_ref.cpp | 71 - .../fused_conv_bn_scale_kernel_ref.h | 40 - .../fused_conv_bn_scale_kernel_selector.cpp | 26 - .../fused_conv_bn_scale_kernel_selector.h | 34 - ...used_conv_eltwise_kernel_af32_imad_1x1.cpp | 145 - .../fused_conv_eltwise_kernel_af32_imad_1x1.h | 40 - .../fused_conv_eltwise_kernel_base.cpp | 25 - .../fused_conv_eltwise_kernel_base.h | 12 - .../fused_conv_eltwise_kernel_gemm.cpp | 142 - .../fused_conv_eltwise_kernel_gemm.h | 42 - .../fused_conv_eltwise_kernel_imad.cpp | 221 -- .../fused_conv_eltwise_kernel_imad.h | 46 - ...kernel_mmad_32x32sg_128x128wg_slm_int8.cpp | 208 -- ...e_kernel_mmad_32x32sg_128x128wg_slm_int8.h | 42 - ...kernel_mmad_32x32sg_224x128wg_slm_int8.cpp | 208 -- ...e_kernel_mmad_32x32sg_224x128wg_slm_int8.h | 42 - .../fused_conv_eltwise_kernel_selector.cpp | 10 - .../index_select/index_select_kernel_base.cpp | 137 - .../index_select/index_select_kernel_base.h | 54 - .../index_select/index_select_kernel_ref.cpp | 54 - .../index_select/index_select_kernel_ref.h | 27 - .../index_select_kernel_selector.cpp | 24 - .../index_select_kernel_selector.h | 31 - .../lookup_table/lookup_table_kernel_axis.cpp | 88 - .../lookup_table/lookup_table_kernel_axis.h | 30 - .../lookup_table/lookup_table_kernel_base.cpp | 84 - .../lookup_table/lookup_table_kernel_base.h | 64 - .../lookup_table/lookup_table_kernel_ref.cpp | 41 - .../lookup_table/lookup_table_kernel_ref.h | 30 - .../lookup_table_kernel_selector.cpp | 31 - .../lookup_table_kernel_selector.h | 35 - .../permute/permute_kernel_ref.cpp | 2 +- .../pooling_kernel_gpu_average_opt.cpp | 102 - 
.../pooling/pooling_kernel_gpu_average_opt.h | 33 - ...pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp | 111 - .../pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h | 39 - ..._kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp | 106 - ...ng_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h | 38 - .../pooling/pooling_kernel_selector.cpp | 6 - .../quantize/quantize_kernel_base.cpp | 4 +- .../scale_grad_weights_kernel_base.cpp | 79 - .../scale_grad_weights_kernel_base.h | 58 - .../scale_grad_weights_kernel_ref.cpp | 44 - .../scale_grad_weights_kernel_ref.h | 30 - .../scale_grad_weights_kernel_selector.cpp | 27 - .../scale_grad_weights_kernel_selector.h | 35 - .../softmax_loss_grad_kernel_base.cpp | 77 - .../softmax_loss_grad_kernel_base.h | 49 - .../softmax_loss_grad_kernel_ref.cpp | 41 - .../softmax_loss_grad_kernel_ref.h | 29 - .../softmax_loss_grad_kernel_selector.cpp | 26 - .../softmax_loss_grad_kernel_selector.h | 33 - .../strided_slice_kernel_ref.cpp | 4 +- .../core/cl_kernels/batch_norm_gpu_ref.cl | 121 - .../cl_kernels/batch_norm_grad_gpu_ref.cl | 80 - .../core/cl_kernels/contract_ref.cl | 64 - .../convolution_gpu_1x1_gemm_mmad.cl | 120 - ...lution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl | 202 -- ...onvolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl | 105 - ...ion_gpu_mmad_32x32sg_128x128wg_slm_int8.cl | 396 --- ...ion_gpu_mmad_32x32sg_224x128wg_slm_int8.cl | 389 --- .../convolution_gpu_mmad_32x32sg_slm_int8.cl | 430 --- .../convolution_gpu_mmad_b_fs_yx_fsv32.cl | 5 - .../convolution_gpu_mmad_batched.cl | 116 - .../convolution_gpu_mmad_batched_block.cl | 199 -- .../convolution_gpu_mmad_batched_block_1x1.cl | 247 -- ...volution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl | 5 - ...nvolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl | 5 - .../convolution_gpu_mmad_slm_2x14_rep4.cl | 945 ------ .../convolution_gpu_mmad_slm_7x7_rep4.cl | 1044 ------- .../convolution_grad_weights_1x1.cl | 136 - .../convolution_grad_weights_3x3.cl | 182 -- .../convolution_grad_weights_7x7.cl | 105 - .../convolution_grad_weights_ref.cl | 122 - .../convolution_grad_weights_yxfb.cl | 118 - .../cl_kernels/deconvolution_gpu_bfyx_opt.cl | 10 - .../core/cl_kernels/deconvolution_gpu_ref.cl | 19 - .../core/cl_kernels/eltwise_b_fs_yx_fsv4.cl | 104 - .../cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl | 83 - .../core/cl_kernels/embed_ref.cl | 34 - .../fully_connected_gpu_mmad_batched.cl | 115 - .../fully_connected_grad_input_gpu_ref.cl | 46 - .../fully_connected_grad_weights_gpu_ref.cl | 80 - .../fused_conv_bn_scale_kernel_ref.cl | 197 -- .../fused_conv_eltwise_gpu_gemm_fp32.cl | 602 ---- ...ise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl | 510 ---- ...ise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl | 505 ---- .../gen9_common_conv_fwd_data_f16.cl | 6 - .../gen9_common_conv_fwd_data_f32.cl | 6 - .../core/cl_kernels/generic_eltwise_ref.cl | 6 +- .../core/cl_kernels/index_select_gpu_ref.cl | 103 - .../core/cl_kernels/lookup_table_axis.cl | 77 - .../core/cl_kernels/lookup_table_ref.cl | 32 - .../pooling_gpu_fs_bs_yx_bsv4_fsv32.cl | 227 -- .../pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl | 124 - .../cl_kernels/scale_grad_weights_gpu_ref.cl | 93 - .../cl_kernels/softmax_loss_grad_gpu_ref.cl | 38 - .../core/common/common_kernel_base.cpp | 12 +- .../core/common/common_kernel_base.h | 2 - .../kernel_selector/core/common/jitter.cpp | 34 +- .../core/common/training_kernel_base.cpp | 34 - .../core/common/training_kernel_base.h | 34 - .../core/common/training_params.cpp | 29 - .../core/common/training_params.h | 44 - .../kernel_selector/core/kernel_base.cpp | 1 - .../core/kernel_selector_common.cpp | 3 - 
.../core/kernel_selector_common.h | 5 - .../core/kernel_selector_params.cpp | 4 - .../core/kernel_selector_params.h | 20 +- .../thirdparty/clDNN/src/activation_grad.cpp | 95 - .../thirdparty/clDNN/src/apply_adam.cpp | 97 - .../thirdparty/clDNN/src/batch_norm.cpp | 116 - .../thirdparty/clDNN/src/batch_norm_grad.cpp | 55 - .../thirdparty/clDNN/src/contract.cpp | 126 - .../clDNN/src/convolution_grad_weights.cpp | 214 -- .../thirdparty/clDNN/src/deconvolution.cpp | 28 +- .../thirdparty/clDNN/src/eltwise.cpp | 36 - .../thirdparty/clDNN/src/embed.cpp | 87 - .../clDNN/src/fully_connected_grad_input.cpp | 76 - .../src/fully_connected_grad_weights.cpp | 70 - .../clDNN/src/fused_conv_bn_scale.cpp | 229 -- .../clDNN/src/gpu/activation_grad_gpu.cpp | 102 - .../clDNN/src/gpu/apply_adam_gpu.cpp | 181 -- .../clDNN/src/gpu/batch_norm_gpu.cpp | 156 - .../clDNN/src/gpu/batch_norm_grad_gpu.cpp | 87 - .../thirdparty/clDNN/src/gpu/contract_gpu.cpp | 86 - .../src/gpu/convolution_grad_weights_gpu.cpp | 194 -- .../clDNN/src/gpu/deconvolution_gpu.cpp | 2 - .../thirdparty/clDNN/src/gpu/eltwise_gpu.cpp | 50 +- .../thirdparty/clDNN/src/gpu/embed_gpu.cpp | 86 - .../gpu/fully_connected_grad_input_gpu.cpp | 86 - .../gpu/fully_connected_grad_weights_gpu.cpp | 113 - .../clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp | 160 - .../clDNN/src/gpu/fused_conv_eltwise_gpu.cpp | 41 - .../clDNN/src/gpu/index_select_gpu.cpp | 107 - .../thirdparty/clDNN/src/gpu/kernel.cpp | 68 - .../thirdparty/clDNN/src/gpu/kernel.h | 5 - .../clDNN/src/gpu/lookup_table_gpu.cpp | 129 - .../thirdparty/clDNN/src/gpu/register_gpu.cpp | 15 - .../thirdparty/clDNN/src/gpu/register_gpu.hpp | 28 - .../clDNN/src/gpu/scale_grad_input_gpu.cpp | 93 - .../clDNN/src/gpu/scale_grad_weights_gpu.cpp | 95 - .../clDNN/src/gpu/softmax_loss_grad_gpu.cpp | 74 - .../add_reshape_to_primitives.cpp | 114 - .../src/graph_optimizer/pre_optimize_bias.cpp | 3 - .../graph_optimizer/pre_replace_deconv.cpp | 5 +- .../prepare_primitive_fusing.cpp | 32 +- .../src/graph_optimizer/trim_to_outputs.cpp | 2 - .../clDNN/src/include/activation_grad_inst.h | 58 - .../clDNN/src/include/apply_adam_inst.h | 68 - .../clDNN/src/include/batch_norm_grad_inst.h | 52 - .../clDNN/src/include/batch_norm_inst.h | 108 - .../clDNN/src/include/contract_inst.h | 52 - .../include/convolution_grad_weights_inst.h | 158 - .../clDNN/src/include/eltwise_inst.h | 35 +- .../thirdparty/clDNN/src/include/embed_inst.h | 56 - .../include/fully_connected_grad_input_inst.h | 54 - .../fully_connected_grad_weights_inst.h | 64 - .../src/include/fused_conv_bn_scale_inst.h | 119 - .../src/include/fused_conv_eltwise_inst.h | 65 +- .../clDNN/src/include/index_select_inst.h | 59 - .../src/include/kernel_selector_helper.h | 25 - .../clDNN/src/include/layout_optimizer.h | 1 - .../clDNN/src/include/lookup_table_inst.h | 53 - .../clDNN/src/include/scale_grad_input_inst.h | 54 - .../src/include/scale_grad_weights_inst.h | 67 - .../src/include/softmax_loss_grad_inst.h | 40 - .../thirdparty/clDNN/src/index_select.cpp | 139 - .../clDNN/src/kernel_selector_helper.cpp | 28 - .../thirdparty/clDNN/src/lookup_table.cpp | 61 - .../thirdparty/clDNN/src/network.cpp | 8 - .../thirdparty/clDNN/src/primitive_inst.cpp | 5 - .../thirdparty/clDNN/src/program.cpp | 3 - .../thirdparty/clDNN/src/scale_grad_input.cpp | 100 - .../clDNN/src/scale_grad_weights.cpp | 112 - .../clDNN/src/softmax_loss_grad.cpp | 50 - .../test_cases/add_reorders_gpu_test.cpp | 59 +- .../tests/test_cases/apply_adam_gpu_test.cpp | 109 - .../tests/test_cases/batch_norm_gpu_test.cpp | 
2663 ----------------- .../test_cases/batch_norm_grad_gpu_test.cpp | 114 - .../tests/test_cases/contract_gpu_test.cpp | 362 --- .../tests/test_cases/convolution_gpu_test.cpp | 871 ------ .../convolution_grad_input_gpu_test.cpp | 208 -- .../convolution_grad_weights_gpu_test.cpp | 1112 ------- .../clDNN/tests/test_cases/embed_gpu_test.cpp | 164 - .../fully_connected_grad_input_gpu_test.cpp | 89 - .../fully_connected_grad_weights_gpu_test.cpp | 249 -- .../fused_conv_eltwise_gpu_test.cpp | 55 - .../tests/test_cases/fusings_gpu_test.cpp | 18 +- .../test_cases/index_select_gpu_test.cpp | 1672 ----------- .../tests/test_cases/lookup_table_test.cpp | 251 -- .../test_cases/scale_grad_input_test.cpp | 90 - .../test_cases/scale_grad_weights_test.cpp | 325 -- .../test_cases/softmax_loss_grad_gpu_test.cpp | 65 - 301 files changed, 58 insertions(+), 31335 deletions(-) delete mode 100644 inference-engine/src/cldnn_engine/dllmain.cpp delete mode 100644 inference-engine/thirdparty/clDNN/api/activation_grad.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/apply_adam.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/batch_norm.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/contract.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/embed.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/index_select.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/lookup_table.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp 
delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_params.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_params.h delete mode 100644 inference-engine/thirdparty/clDNN/src/activation_grad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/apply_adam.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/batch_norm.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/batch_norm_grad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/contract.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/convolution_grad_weights.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/embed.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/fully_connected_grad_input.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/fully_connected_grad_weights.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/fused_conv_bn_scale.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/activation_grad_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/apply_adam_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/batch_norm_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/batch_norm_grad_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/contract_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/convolution_grad_weights_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/embed_gpu.cpp delete mode 
100644 inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_input_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_weights_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/index_select_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/lookup_table_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/scale_grad_input_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/scale_grad_weights_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/softmax_loss_grad_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/add_reshape_to_primitives.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/include/activation_grad_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/apply_adam_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/batch_norm_grad_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/batch_norm_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/contract_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/convolution_grad_weights_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/embed_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/fully_connected_grad_input_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/fully_connected_grad_weights_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/fused_conv_bn_scale_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/index_select_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/lookup_table_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/scale_grad_input_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/scale_grad_weights_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/softmax_loss_grad_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/index_select.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/lookup_table.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/scale_grad_input.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/scale_grad_weights.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/softmax_loss_grad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/apply_adam_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_grad_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/contract_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_input_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_weights_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/embed_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_input_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_weights_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/index_select_gpu_test.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/tests/test_cases/lookup_table_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_input_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_weights_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/softmax_loss_grad_gpu_test.cpp

diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp
index 37c30015c29..bce89f2a8d5 100644
--- a/inference-engine/src/cldnn_engine/cldnn_program.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -1533,49 +1532,11 @@ void Program::CreateBatchNormalizationPrimitive(cldnn::topology& topology, Infer
     cldnn::primitive_id weightID = bnLayerName + "_" + m_scalesTag;
     cldnn::primitive_id biasID = bnLayerName + "_" + m_biasesTag;
 
-#define _SCALE_BN_OPT
-#ifdef _SCALE_BN_OPT
-    // Using scale as an optimization (1 mad instead of mad+rsq)
-    // create new blobs for scale shift
     CreateScaleWeightsAndBiasesFromBN(topology, bnLayer, weightID, biasID);
 
     auto scalePrim = cldnn::scale(bnLayerName, inputPrimitives[0], weightID, biasID);
 
     topology.add(scalePrim);
-#else
-    cldnn::tensor blobTensor(0);
-    const auto bnDims = bnLayer->outData[0]->getTensorDesc().getDims();
-    switch (bnDims.size()) {
-    case 2:
-        blobTensor = cldnn::feature(TensorValue(bnDims[1]));
-        break;
-    case 4:
-        blobTensor = cldnn::feature(TensorValue(bnDims[1]));
-        break;
-    default:
-        THROW_CLDNN_EXCEPTION("Batch normalization input doesn't have 2 or 4 dimensions in " << bnLayer->name);
-    }
-    cldnn::layout blobLayout(
-        DataTypeFromPrecision(layer->precision),
-        m_defaultFormat,
-        blobTensor);
-
-    // Create variance primitive
-    cldnn::primitive_id varianceID = bnLayerName + "_" + m_weightsTag;
-    varianceID = CreatePrimitiveFromBlob(topology, varianceID, bnLayer->_weights, blobLayout);
-
-    // Create mean primitive
-    cldnn::primitive_id meanID = bnLayerName + "_" + m_biasesTag;
-    meanID = CreatePrimitiveFromBlob(topology, meanID, bnLayer->_biases, blobLayout);
-
-    auto bnPrim = cldnn::batch_norm(
-        bnLayerName,
-        inputPrimitives[0],
-        meanID,
-        varianceID,
-        bnLayer->epsilon);
-
-    topology.add(bnPrim);
-#endif  // _SCALE_BN_OPT
 
     AddPrimitiveToProfiler(bnLayerName, layer);
 }

diff --git a/inference-engine/src/cldnn_engine/dllmain.cpp b/inference-engine/src/cldnn_engine/dllmain.cpp
deleted file mode 100644
index a484571a204..00000000000
--- a/inference-engine/src/cldnn_engine/dllmain.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-// dllmain.cpp : Defines the entry point for the DLL application.
-#ifdef _WIN32
-#include 
-
-BOOL APIENTRY DllMain(HMODULE hModule,
-                      DWORD ul_reason_for_call,
-                      LPVOID lpReserved) {
-    switch (ul_reason_for_call) {
-    case DLL_PROCESS_ATTACH:
-    case DLL_THREAD_ATTACH:
-    case DLL_THREAD_DETACH:
-    case DLL_PROCESS_DETACH:
-        break;
-    }
-    return TRUE;
-}
-
-#endif

diff --git a/inference-engine/thirdparty/clDNN/api/activation.hpp b/inference-engine/thirdparty/clDNN/api/activation.hpp
index 9c88a38d7ea..80a120d3f73 100644
--- a/inference-engine/thirdparty/clDNN/api/activation.hpp
+++ b/inference-engine/thirdparty/clDNN/api/activation.hpp
@@ -71,13 +71,6 @@ enum class activation_func {
     gelu                  // (0.5*val*(1 + erf(val / sqrt(2))))
 };
 
-/// @brief activation gradient functions
-enum class activation_grad_func {
-    none,                 // val
-    relu,                 // val * (input > 0)
-    relu_negative_slope,  // val * ((input > 0) + a * (input <= 0)) (a is additional param)
-};
-
 /// @brief activation additional params
 struct activation_additional_params {
     float a, b;

diff --git a/inference-engine/thirdparty/clDNN/api/activation_grad.hpp b/inference-engine/thirdparty/clDNN/api/activation_grad.hpp
deleted file mode 100644
index d2d4d628dfe..00000000000
--- a/inference-engine/thirdparty/clDNN/api/activation_grad.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include "activation.hpp"
-#include 
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Activation gradient for rectified linear unit or parameterized rectified linear unit.
-/// @par Algorithm:
-///   out(i,x,y) = input_gradient(i,x,y) * ((input(i,x,y) > 0) + slope(i) * (input(i,x,y) <= 0))
-/// @par Where:
-/// @li out(i,x,y) : value at x, y from i-th feature map after activation.
-/// @li in(i,x,y) : value at x, y from i-th feature map before activation.
-/// @li slope(i) : the slope value of the i-th feature map (can be shared across channels or one slope per channel).
-struct activation_grad : public primitive_base {
-    CLDNN_DECLARE_PRIMITIVE(activation_grad)
-
-    /// @brief Constructs Relu grad primitive.
-    /// @param id This primitive id.
-    /// @param input_grad Input gradient primitive id.
-    /// @param input Input primitive id.
-    /// @param activation_grad_function activation_grad function.
-    /// @param additional_params additional params (slope).
-    activation_grad(const primitive_id& id,
-                    const primitive_id& input_grad,
-                    const primitive_id& input,
-                    activation_grad_func activation_grad_function,
-                    activation_additional_params additional_params = {0.f, 0.f},
-                    const padding& output_padding = padding())
-        : primitive_base(id, {input_grad, input}, output_padding),
-          activation_grad_function(activation_grad_function),
-          additional_params(additional_params),
-          additional_params_input("") {}
-
-    /// @brief Constructs Relu grad primitive.
-    /// @param id This primitive id.
-    /// @param input_grad Input gradient primitive id.
-    /// @param input Input primitive id.
-    /// @param activation_grad_function activation_grad function.
-    /// @param additional_params_input Additional params (slope) input primitive id.
-    activation_grad(const primitive_id& id,
-                    const primitive_id& input_grad,
-                    const primitive_id& input,
-                    const primitive_id& additional_params_input,
-                    activation_grad_func activation_grad_function,
-                    const padding& output_padding = padding())
-        : primitive_base(id, {input_grad, input}, output_padding),
-          activation_grad_function(activation_grad_function),
-          additional_params({0, 0}),
-          additional_params_input(additional_params_input) {}
-
-    /// @brief activation_grad function.
-    activation_grad_func activation_grad_function;
-
-    /// @brief activation_grad additional params.
-    activation_additional_params additional_params;
-
-    /// @brief PRelu activation slope input primitive id.
-    /// Input x dimension should be equal to input feature size (one slope per channel).
-    /// All other dimensions should be 1.
-    primitive_id additional_params_input;
-
-protected:
-    std::vector> get_dependencies() const override {
-        if (additional_params_input.empty())
-            return {};
-        return {additional_params_input};
-    }
-};
-/// @}
-/// @}
-/// @}
-}  // namespace cldnn

diff --git a/inference-engine/thirdparty/clDNN/api/apply_adam.hpp b/inference-engine/thirdparty/clDNN/api/apply_adam.hpp
deleted file mode 100644
index f74523b7062..00000000000
--- a/inference-engine/thirdparty/clDNN/api/apply_adam.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include 
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Apply Adam primitive.
-/// @details Updates output using the Adam algorithm. The output of this primitive should be of mutable_data type if the user wants to update
-/// the variable across the network. If the output is not mutable_data, it will be initialized with 0.
-/// "Adam: A Method for Stochastic Optimization" by Diederik P.
-/// Kingma, Jimmy Ba
-/// @n See: https://arxiv.org/abs/1412.6980
-///
-/// Algorithm:
-/// @n float lr[t] = lr * sqrt(1 - beta2^t) / (1 - beta1^t);
-/// @n float m[t] = beta1 * m[t-1] + (1 - beta1) * grad[t];
-/// @n float v[t] = beta2 * v[t-1] + (1 - beta2) * grad[t] * grad[t];
-/// @n float result = result - lr[t] * m[t] / (sqrt(v[t]) + epsilon);
-
-struct apply_adam : public primitive_base {
-    CLDNN_DECLARE_PRIMITIVE(apply_adam)
-
-    /// @brief Constructs apply Adam primitive.
-    /// @param id This primitive id.
-    /// @param input Input gradient primitive id.
-    /// @param m Primitive id containing mean data.
-    /// @param v Primitive id containing variance.
-    /// @param beta1_power Primitive id containing beta1^t.
-    /// @param beta2_power Primitive id containing beta2^t.
-    /// @param lr Learning rate parameter.
-    /// @param beta1 Beta1 parameter.
-    /// @param beta2 Beta2 parameter.
-    /// @param epsilon Epsilon.
-    /// @param dependency_id Optional primitive id that needs to complete before execution of this primitive. Used only for synchronization.
-    apply_adam(const primitive_id& id,
-               const primitive_id& input,
-               const primitive_id& m,
-               const primitive_id& v,
-               const primitive_id& beta1_power,
-               const primitive_id& beta2_power,
-               float lr,
-               float beta1,
-               float beta2,
-               float epsilon,
-               const primitive_id& dependency_id = "",
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          m(m),
-          v(v),
-          beta1_power(beta1_power),
-          beta2_power(beta2_power),
-          lr(lr),
-          beta1(beta1),
-          beta2(beta2),
-          epsilon(epsilon),
-          dependency_id(dependency_id) {}
-
-    /// @brief Primitive id containing m data.
-    primitive_id m;
-    /// @brief Primitive id containing v data.
-    primitive_id v;
-    /// @brief Primitive id containing beta1^t.
-    primitive_id beta1_power;
-    /// @brief Primitive id containing beta2^t.
-    primitive_id beta2_power;
-    /// @brief Learning rate parameter.
-    float lr;
-    /// @brief Beta1 parameter.
-    float beta1;
-    /// @brief Beta2 parameter.
-    float beta2;
-    /// @brief Epsilon.
-    float epsilon;
-    /// @brief Optional primitive id that needs to complete before execution of this primitive. Used only for synchronization.
-    primitive_id dependency_id;
-
-protected:
-    std::vector> get_dependencies() const override {
-        std::vector> ret{m, v, beta1_power, beta2_power};
-        ret.reserve(!dependency_id.empty());
-        if (!dependency_id.empty())
-            ret.push_back(dependency_id);
-        return ret;
-    }
-};
-/// @}
-/// @}
-/// @}
-}  // namespace cldnn

diff --git a/inference-engine/thirdparty/clDNN/api/batch_norm.hpp b/inference-engine/thirdparty/clDNN/api/batch_norm.hpp
deleted file mode 100644
index 29b8e69d960..00000000000
--- a/inference-engine/thirdparty/clDNN/api/batch_norm.hpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include 
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Batch normalization primitive.
-/// @details Performs batch normalization as described in
-/// "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" by Ioffe, Szegedy
-/// @n See: http://arxiv.org/abs/1502.03167
-///
-/// Algorithm:
-/// @n global stats can be computed as:
-/// @n out[i] = ( (in[i] - mean[b]) / sqrt(variance[b] + epsilon) ) * scale[b] + shift[b]
-
-struct batch_norm : public primitive_base {
-    CLDNN_DECLARE_PRIMITIVE(batch_norm)
-
-    /// @brief Constructs batch normalization primitive.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param mean Primitive id containing mean data.
-    /// @param variance Primitive id containing variance.
-    /// @param epsilon Epsilon.
-    batch_norm(const primitive_id& id,
-               const primitive_id& input,
-               const primitive_id& mean,
-               const primitive_id& variance,
-               float epsilon,
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          mean(mean),
-          variance(variance),
-          inv_variance(""),
-          epsilon(epsilon) {}
-
-    /// @brief Constructs batch normalization primitive.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param mean Primitive id containing mean data.
-    /// @param variance Primitive id containing variance.
-    /// @param scale Primitive id containing scale.
-    /// @param shift Primitive id containing shift.
-    /// @param epsilon Epsilon.
-    batch_norm(const primitive_id& id,
-               const primitive_id& input,
-               const primitive_id& mean,
-               const primitive_id& variance,
-               const primitive_id& scale,
-               const primitive_id& shift,
-               float epsilon,
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          mean(mean),
-          variance(variance),
-          scale(scale),
-          shift(shift),
-          inv_variance(""),
-          epsilon(epsilon) {}
-
-    /// @brief Constructs batch normalization primitive with mean and variance calculation (used for training).
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param epsilon Epsilon.
-    /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. For inference leave empty.
-    batch_norm(const primitive_id& id,
-               const primitive_id& input,
-               float epsilon,
-               const primitive_id& inv_variance = "",
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          mean(""),
-          variance(""),
-          inv_variance(inv_variance),
-          epsilon(epsilon) {}
-
-    /// @brief Constructs batch normalization primitive with mean and variance calculation (used for training).
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param scale Primitive id containing scale.
-    /// @param shift Primitive id containing shift.
-    /// @param epsilon Epsilon.
-    /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. For inference leave empty.
- batch_norm(const primitive_id& id, - const primitive_id& input, - float epsilon, - const primitive_id& scale, - const primitive_id& shift, - const primitive_id& inv_variance = "", - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), - mean(""), - variance(""), - scale(scale), - shift(shift), - inv_variance(inv_variance), - epsilon(epsilon) {} - - /// @brief Constructs batch normalization primitive with mean and variance calculation (used for training). - /// @param id This primitive id. - /// @param input Input primitive id. - /// @brief scale Primitive id containing scale. - /// @brief shift Primitive id containing shift. - /// @brief mean_out Primitive id containing mean output. - /// @brief variance_out Primitive id containing variance output. - /// @param epsilon Epsilon. - /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. For inference leave empty. - batch_norm(const primitive_id& id, - const primitive_id& input, - float epsilon, - const primitive_id& mean_out, - const primitive_id& variance_out, - const primitive_id& scale, - const primitive_id& shift, - const primitive_id& inv_variance = "", - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), - mean(mean_out), - variance(variance_out), - scale(scale), - shift(shift), - inv_variance(inv_variance), - epsilon(epsilon) {} - - /// @brief Primitive id containing mean data. - primitive_id mean; - /// @brief Primitive id containing variance. - primitive_id variance; - /// @brief Primitive id containing scale. - primitive_id scale; - /// @brief Primitive id containing shift. - primitive_id shift; - /// @brief Primitive id containing inverted variance used in future gradient computing. - primitive_id inv_variance; - /// @brief Epsilon. - float epsilon; - -protected: - std::vector> get_dependencies() const override { - std::vector> deps; - - if (!mean.empty() && !variance.empty()) { - deps.push_back(mean); - deps.push_back(variance); - } - - if (!scale.empty() && !shift.empty()) { - deps.push_back(scale); - deps.push_back(shift); - } - - if (!inv_variance.empty()) - deps.push_back(inv_variance); - - return deps; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp b/inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp deleted file mode 100644 index cf487ad8575..00000000000 --- a/inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Performs the backward pass of batch normalization.
-/// @details Calculates the mean gradient and the mean of gradient * input for every feature in the data;
-/// the output is then calculated as inv_variance * (input_grad - mean_grad_input * input - mean_grad).
-struct batch_norm_grad : public primitive_base<batch_norm_grad> {
-    CLDNN_DECLARE_PRIMITIVE(batch_norm_grad)
-
-    /// @brief Constructs batch normalization backward layer.
-    /// @param id This primitive id.
-    /// @param input_grad Input gradient primitive id.
-    /// @param input Input primitive id.
-    /// @param inv_variance Primitive id containing inverted variance from forward pass.
-    batch_norm_grad(
-        const primitive_id& id,
-        const primitive_id& input_grad,
-        const primitive_id& input,
-        const primitive_id& inv_variance,
-        const padding& output_padding = padding())
-        : primitive_base(id, {input_grad, input}, output_padding), inv_variance(inv_variance) {
-    }
-
-    /// @brief Primitive id containing inverted variance from forward pass.
-    primitive_id inv_variance;
-
-protected:
-    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
-        return {inv_variance};
-    }
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api/contract.hpp b/inference-engine/thirdparty/clDNN/api/contract.hpp
deleted file mode 100644
index 9242b4e845e..00000000000
--- a/inference-engine/thirdparty/clDNN/api/contract.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Select mode for the @ref contract layer.
-enum class contract_mode : int32_t {
-    /// @brief Sum reduction.
-    sum,
-    /// @brief Product reduction.
-    prod,
-    /// @brief All reduction.
-    all,
-    /// @brief Any reduction.
-    any,
-    /// @brief Max reduction.
-    max
-};
-
-/// @brief Reduces the input with an operation defined by @p mode along the dimensions
-/// defined by @p reduction_axes.
-///
-/// @details Reduces the input using the binary operation determined by
-/// @p mode. The @p reduction_axes determine the final shape of the
-/// output, which is calculated based on the input shape by
-/// collapsing the dimensions along which the reduction happens.
-/// For example, for the input with -/// @n input_sizes = (in_b, in_f, in_y, in_x) -/// @n a reduction with -/// @n reduction_axes = (2) -/// @n would collapse the Y dimension, producing -/// @n output_shape = (1, in_b, in_f, in_x) -/// @n where every element is a @p mode reduction of the input elements with -/// @n the same B, F and X coordinates. -/// @n -/// @n@b Requirements: -/// @n - @p reduction_axes size (dimensions count) must be within (inclusive) range -/// 1 - 4. -/// @n - @p reduction_axes mustn't have duplicate values. -/// @n - Values of @p reduction_axes must be within (inclusive) range 0 - 3 -/// @n Breaking any of these conditions will raise an exception. -struct contract : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(contract) - - /// @brief Constructs contract primitive / layer. - /// - /// @param id An identifier of new primitive. - /// @param input An identifier of primitive which is an input for newly created - /// contract primitive. - /// @param mode Reduction mode. - /// @param reduction_axes Axes positions (0-based, from left to right) in input_shape - /// that are being reduced. - /// @param output_padding Optional padding for output from primitive. - contract( - const primitive_id& id, - const primitive_id& input, - contract_mode mode, - const std::vector& reduction_axes = {}, - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), - mode(mode), - reduction_axes(reduction_axes) { - } - /// @param mode Contract mode. - contract_mode mode; - /// @brief Array of axes positions from input shape (0-based, from left to right) - /// along which reduction should happen. - std::vector reduction_axes; -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp b/inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp deleted file mode 100644 index 534aedb78d6..00000000000 --- a/inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "deconvolution.hpp" -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward convolution operation for input. -/// @details convolution_grad_input is similar to deconvolution layer without biases and activation support. -/// It actually uses deconvolution primitive underneath with gradient bool set to true. -struct convolution_grad_input : public deconvolution { - /// @brief Constructs convolution_grad_input primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights List of primitive ids containing weights data. 
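To make the reduction semantics concrete, here is a standalone sketch of a contract_mode::sum reduction over the Y axis of a bfyx buffer, matching the reduction_axes = (2) example above; the function name and flat indexing are illustrative:

    #include <vector>

    // Sums a bfyx-laid-out buffer along Y:
    // out[b][f][x] = sum over y of in[b][f][y][x], the shape collapse the
    // contract docs describe for reduction_axes = (2).
    std::vector<float> contract_sum_over_y(const std::vector<float>& in,
                                           size_t B, size_t F, size_t Y, size_t X) {
        std::vector<float> out(B * F * X, 0.0f);
        for (size_t b = 0; b < B; ++b)
            for (size_t f = 0; f < F; ++f)
                for (size_t y = 0; y < Y; ++y)
                    for (size_t x = 0; x < X; ++x)
                        out[(b * F + f) * X + x] += in[((b * F + f) * Y + y) * X + x];
        return out;
    }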
- /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_input window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param with_activation Enables Relu activation. - /// @param activation_slp Relu activation slope. - convolution_grad_input(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding()) - : deconvolution(id, input, {weights}, stride, input_offset, output_padding, true) {} - - /// @brief Constructs convolution_grad_input primitive (computes input paddings to match output size). - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_input window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param with_activation Enables Relu activation. - /// @param activation_slp Relu activation slope. - /// @param output_size User-defined output data size of the primitive (w/o padding). - convolution_grad_input(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - tensor stride, - tensor input_offset, - tensor output_size, - const padding& output_padding = padding()) - : deconvolution(id, input, {weights}, stride, input_offset, output_size, output_padding, true) {} - - /// @brief Constructs convolution_grad_input primitive (computes input paddings to match output size). - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_input window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param with_activation Enables Relu activation. - /// @param activation_slp Relu activation slope. - /// @param output_size User-defined output data size of the primitive (w/o padding). - /// @return convolution_grad_input primitive with specified settings. - static convolution_grad_input create_with_output_size(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - tensor output_size, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding()) { - return convolution_grad_input(id, input, weights, stride, input_offset, output_size, output_padding); - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp b/inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp deleted file mode 100644 index fa15fa73fca..00000000000 --- a/inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp +++ /dev/null @@ -1,217 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward convolution operation for weights and biases. -/// @details convolution_grad_weights updates weights and bias mutable data for training purposes. -/// @details Please note that this primitive was not heavily tested and currently only batch=1 is enabled for this primitive. -struct convolution_grad_weights - : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(convolution_grad_weights) - - /// @brief Constructs convolution_grad_weights primitive. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param bias List of primitive ids containing bias data. Provide empty vector if using next parameters without bias. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - const std::vector& bias, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - tensor dilation = {1, 1, 1, 1}, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(false), - weights(weights), - bias(bias), - prev_weights_grad(std::vector(0)), - prev_bias_grad(std::vector(0)) {} - - /// @brief Constructs convolution_grad_weights primitive (w/o bias). - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param Should primitive give weights gradient (delta) as an output - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. 
- /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - tensor dilation = {1, 1, 1, 1}, - bool output_grad_w = false, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(output_grad_w), - weights(weights), - bias(std::vector(0)), - prev_weights_grad(std::vector(0)), - prev_bias_grad(std::vector(0)) {} - - /// @brief Constructs convolution_grad_weights primitive (w/o bias). - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - tensor stride, - tensor input_offset, - tensor dilation, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(false), - weights(weights), - bias(std::vector(0)), - prev_weights_grad(std::vector(0)), - prev_bias_grad(std::vector(0)) {} - - /// @brief Constructs convolution_grad_weights primitive with momentum optimizer. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param bias List of primitive ids containing bias data. Provide empty vector if using next parameters without bias. - /// @param prev_weights_grad List of primitive ids which contains weights gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param prev_bias_grad List of primitive ids which contains bias gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. 
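The core accumulation behind all of these constructors is the standard convolution weight gradient. A deliberately simplified 1-D, single-channel sketch of that accumulation (the real kernels work on 4-D tensors and additionally handle batching and padding):

    #include <vector>

    // dW[k] = sum_o grad_out[o] * input[o * stride + k * dilation]: the 1-D,
    // unit-batch core of what convolution_grad_weights accumulated.
    std::vector<float> conv1d_weight_grad(const std::vector<float>& input,
                                          const std::vector<float>& grad_out,
                                          size_t kernel, size_t stride = 1, size_t dilation = 1) {
        std::vector<float> dW(kernel, 0.0f);
        for (size_t o = 0; o < grad_out.size(); ++o)
            for (size_t k = 0; k < kernel; ++k) {
                const size_t i = o * stride + k * dilation;
                if (i < input.size())
                    dW[k] += grad_out[o] * input[i];
            }
        return dW;
    }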
- convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - const std::vector& bias, - const std::vector& prev_weights_grad, - const std::vector& prev_bias_grad, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - tensor dilation = {1, 1, 1, 1}, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(false), - weights(weights), - bias(bias), - prev_weights_grad(prev_weights_grad), - prev_bias_grad(prev_bias_grad) {} - - /// @brief Primitive id containing convolution gradient data. - primitive_id conv_grad; - /// @brief Defines shift in input buffer between adjacent calculations of output values. - tensor stride; - /// @brief Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - tensor input_offset; - /// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. - /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. - /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. - tensor dilation; - /// @brief Should primitive give weights gradient (delta) as an output - bool output_grad_w; - /// @brief List of primitive ids containing weights data. - const primitive_id_arr weights; - /// @brief List of primitive ids containing bias data. - const primitive_id_arr bias; - /// @brief Array of primitive ids containing weights gradient data calculated in previous iteration. - /// Amount of primitives and their memory sizes should be same as weights. - const primitive_id_arr prev_weights_grad; - /// @brief Array of primitive ids containing bias gradient data calculated in previous iteration. - /// Amount of primitives and their memory sizes should be same as biases. - const primitive_id_arr prev_bias_grad; - - /// @brief On how many cards split the computation to. - int32_t split() const { return static_cast(weights.size()); } - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - ret.reserve(weights.size() + bias.size() + !conv_grad.empty() + prev_weights_grad.size() + - prev_bias_grad.size()); - for (auto& w : weights) ret.push_back(std::ref(w)); - for (auto& b : bias) ret.push_back(std::ref(b)); - - for (auto& g : prev_weights_grad) ret.push_back(std::ref(g)); - for (auto& g : prev_bias_grad) ret.push_back(std::ref(g)); - if (!conv_grad.empty()) - ret.push_back(conv_grad); - - return ret; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/deconvolution.hpp b/inference-engine/thirdparty/clDNN/api/deconvolution.hpp index a506a0850fd..141e7b8bbe5 100644 --- a/inference-engine/thirdparty/clDNN/api/deconvolution.hpp +++ b/inference-engine/thirdparty/clDNN/api/deconvolution.hpp @@ -56,8 +56,7 @@ struct deconvolution : public primitive_base { with_output_size(false), groups(1), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive. /// @param id This primitive id. /// @param input Input primitive id. 
@@ -83,8 +82,7 @@ struct deconvolution : public primitive_base { with_output_size(false), groups(groups), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive (w/o bias). /// @param id This primitive id. @@ -100,16 +98,14 @@ struct deconvolution : public primitive_base { const std::vector& weights, tensor stride = {1, 1, 1, 1}, tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding(), - bool gradient = false) + const padding& output_padding = padding()) : primitive_base(id, {input}, output_padding), input_offset(input_offset), stride(stride), with_output_size(false), groups(1), weights(weights), - bias(std::vector(0)), - _gradient(gradient) {} + bias(std::vector(0)) {} /// @brief Constructs deconvolution primitive (w/o bias). /// @param id This primitive id. @@ -127,16 +123,14 @@ struct deconvolution : public primitive_base { uint32_t groups, tensor stride = {1, 1, 1, 1}, tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding(), - bool gradient = false) + const padding& output_padding = padding()) : primitive_base(id, {input}, output_padding), input_offset(input_offset), stride(stride), with_output_size(false), groups(groups), weights(weights), - bias(std::vector(0)), - _gradient(gradient) {} + bias(std::vector(0)) {} /// @brief Constructs deconvolution primitive (computes input paddings to match output size). /// @param id This primitive id. @@ -164,8 +158,7 @@ struct deconvolution : public primitive_base { output_size(output_size), groups(1), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive (computes input paddings to match output size). /// @param id This primitive id. @@ -195,8 +188,7 @@ struct deconvolution : public primitive_base { output_size(output_size), groups(groups), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive (w/o bias, computes input paddings to match output size). /// @param id This primitive id. @@ -214,8 +206,7 @@ struct deconvolution : public primitive_base { tensor stride, tensor input_offset, tensor output_size, - const padding& output_padding = padding(), - bool gradient = false) + const padding& output_padding = padding()) : primitive_base(id, {input}, output_padding), input_offset(input_offset), stride(stride), @@ -223,8 +214,7 @@ struct deconvolution : public primitive_base { output_size(output_size), groups(1), weights(weights), - bias(std::vector(0)), - _gradient(gradient) {} + bias(std::vector(0)) {} /// @brief Constructs deconvolution primitive (computes input paddings to match output size). /// @param id This primitive id. @@ -300,12 +290,8 @@ struct deconvolution : public primitive_base { /// @brief On how many cards split the computation to. 
int32_t split() const { return static_cast(weights.size()); } - /// @brief Indicates that deconvolution is used for convolution backward computation (convolution_grad_input) - bool gradient() const { return _gradient; } protected: - bool _gradient; - std::vector> get_dependencies() const override { std::vector> ret; ret.reserve(weights.size() + bias.size()); diff --git a/inference-engine/thirdparty/clDNN/api/eltwise.hpp b/inference-engine/thirdparty/clDNN/api/eltwise.hpp index bff27ac4991..0926b52c430 100644 --- a/inference-engine/thirdparty/clDNN/api/eltwise.hpp +++ b/inference-engine/thirdparty/clDNN/api/eltwise.hpp @@ -92,13 +92,9 @@ struct eltwise : public primitive_base { eltwise_mode mode, const padding& output_padding = padding()) : primitive_base(id, {input, input2}, output_padding), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) {} + stride(std::vector(0)) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -115,13 +111,9 @@ struct eltwise : public primitive_base { eltwise_mode mode, const padding& output_padding = padding()) : primitive_base(id, {input, input2}, output_padding), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(stride), - inputs_calibration_factors(std::vector(0)) {} + stride(stride) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -134,13 +126,9 @@ struct eltwise : public primitive_base { data_types data_type, const padding& output_padding = padding()) : primitive_base(id, inputs, output_padding, optional_data_type{data_type}), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) {} + stride(std::vector(0)) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -151,13 +139,9 @@ struct eltwise : public primitive_base { eltwise_mode mode, const padding& output_padding = padding()) : primitive_base(id, inputs, output_padding), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) {} + stride(std::vector(0)) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -171,13 +155,9 @@ struct eltwise : public primitive_base { data_types data_type, const padding& output_padding = padding()) : primitive_base(id, inputs, output_padding, optional_data_type{data_type}), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(coefficients), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) { + stride(std::vector(0)) { if (mode == eltwise_mode::sum && !coefficients.empty() && coefficients.size() != inputs.size()) { throw std::invalid_argument("Invalid eltwise sum coefficients count (should be equal to 0 or input.size)"); } @@ -186,31 +166,12 @@ struct eltwise : public primitive_base { } } - /// @brief Primitive id containing output quanitization factors per output feature map. 
- primitive_id output_calibration_factors; - /// @brief Output quantization factor - float output_quantization_factor; - /// @brief List of quantization factors per input. - std::vector input_quantization_factors; /// @param mode Eltwise mode. eltwise_mode mode; /// @param coefficients Blob-wise coefficient for SUM operation. std::vector coefficients; /// @brief Defines shift in input buffers between adjacent calculations of output values. std::vector stride; - /// @brief List of primitive ids containing input quantization factors per feature map, one primitive id for each input. - const primitive_id_arr inputs_calibration_factors; - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - if (!output_calibration_factors.empty()) - ret.push_back(output_calibration_factors); - - for (auto& icf : inputs_calibration_factors) ret.push_back(std::ref(icf)); - - return ret; - } }; /// @} /// @} diff --git a/inference-engine/thirdparty/clDNN/api/embed.hpp b/inference-engine/thirdparty/clDNN/api/embed.hpp deleted file mode 100644 index 91a66e32b44..00000000000 --- a/inference-engine/thirdparty/clDNN/api/embed.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief -/// @details Performs embedding upon input. -/// @n\b Example: -/// @n input_size = { 8, 1, 1, 75 }; -/// @n weights_size = {15, 1, 62, 1 }; -/// @n output_size = { 8, 75, 15, 1 }; -/// @par Algorithm: -/// @par Where: -struct embed : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(embed) - - /// @brief Constructs embed primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - /// @param bias Primitive id containing bias data. - embed( - const primitive_id& id, - const primitive_id& input, - const primitive_id& weights, - const primitive_id& bias) - : primitive_base(id, {input}), weights(weights), bias(bias) {} - - /// @brief Constructs embed primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - embed( - const primitive_id& id, - const primitive_id& input, - const primitive_id& weights) - : primitive_base(id, {input}), weights(weights), bias("") {} - - /// @brief Primitive id containing weights data. - primitive_id weights; - /// @brief Primitive id containing bias data. 
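The example sizes in the removed embed docs correspond to a plain embedding lookup plus bias; a standalone sketch under that interpretation (function name and row-major layout are assumptions):

    #include <cstdint>
    #include <vector>

    // With the sizes quoted above: 62 vocabulary rows of dim = 15 values,
    // 8 * 75 input ids, output of shape {8, 75, 15, 1}.
    // out[t][d] = weights[ids[t]][d] + bias[d] for every token position t.
    std::vector<float> embed_lookup(const std::vector<int32_t>& ids,     // batch * seq_len
                                    const std::vector<float>& weights,   // vocab * dim
                                    const std::vector<float>& bias,      // dim
                                    size_t dim) {
        std::vector<float> out(ids.size() * dim);
        for (size_t t = 0; t < ids.size(); ++t)
            for (size_t d = 0; d < dim; ++d)
                out[t * dim + d] = weights[static_cast<size_t>(ids[t]) * dim + d] + bias[d];
        return out;
    }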
- primitive_id bias; - -protected: - std::vector> get_dependencies() const override { - if (bias.empty()) - return {weights}; - else - return {weights, bias}; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn -#pragma once diff --git a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp b/inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp deleted file mode 100644 index 23463cda324..00000000000 --- a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward fully connected layer (inner product) for input. -struct fully_connected_grad_input : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(fully_connected_grad_input) - - /// @brief Constructs fully connected layer grad for input. - /// @param id This primitive id. - /// @param input_grad Input gradient primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - fully_connected_grad_input( - const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const primitive_id& weights, - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), weights(weights) { - } - - /// @brief Primitive id containing weights data. - primitive_id weights; - -protected: - std::vector> get_dependencies() const override { - return {weights}; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp b/inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp deleted file mode 100644 index 71af7a81691..00000000000 --- a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp +++ /dev/null @@ -1,115 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
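fully_connected_grad_input, removed just above, computed the standard backward-data pass of an inner product: the output gradient multiplied by the untransposed weight matrix (its grad-weights counterpart accumulates outer products of inputs and output gradients in the same fashion). A minimal sketch, with a row-major weight layout assumed:

    #include <vector>

    // grad_in[i] = sum_o grad_out[o] * W[o][i], with W stored row-major as
    // out_features x in_features.
    std::vector<float> fc_grad_input(const std::vector<float>& grad_out,  // out_features
                                     const std::vector<float>& W,         // out_features * in_features
                                     size_t in_features) {
        std::vector<float> grad_in(in_features, 0.0f);
        for (size_t o = 0; o < grad_out.size(); ++o)
            for (size_t i = 0; i < in_features; ++i)
                grad_in[i] += grad_out[o] * W[o * in_features + i];
        return grad_in;
    }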
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward fully connected layer (inner product) for weights and biases. -struct fully_connected_grad_weights - : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(fully_connected_grad_weights) - - /// @brief Constructs fully connected layer for weights and biases. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - /// @param bias Primitive id containing bias data. Provide empty string if using Relu without bias. - /// @param fc_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - fully_connected_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const primitive_id& weights, - const primitive_id& bias = "", - const primitive_id& fc_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - weights(weights), - bias(bias), - fc_grad(fc_grad), - prev_weights_grad(""), - prev_bias_grad("") {} - - /// @brief Constructs fully connected layer for weights and biases with momentum optimizer. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - /// @param bias Primitive id containing bias data. Provide empty string if using Relu without bias. - /// @param prev_weights_grad Id of primitive which contains weights gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param prev_bias_grad Id of primitive which contains bias gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param fc_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. - fully_connected_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const primitive_id& weights, - const primitive_id& bias, - const primitive_id& prev_weights_grad, - const primitive_id& prev_bias_grad, - const primitive_id& fc_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - weights(weights), - bias(bias), - fc_grad(fc_grad), - prev_weights_grad(prev_weights_grad), - prev_bias_grad(prev_bias_grad) {} - - /// @brief Primitive id containing weights data. - primitive_id weights; - /// @brief Primitive id containing bias data. - primitive_id bias; - /// @brief Primitive id containing fully connected gradient data. - primitive_id fc_grad; - /// @brief Id of primitive containing weights gradient data calculated in previous iteration. It's memory size should be same as weights. - primitive_id prev_weights_grad; - /// @brief Id of primitive containing bias gradient data calculated in previous iteration. It's memory size should be same as biases. 
-    primitive_id prev_bias_grad;
-
-protected:
-    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
-        std::vector<std::reference_wrapper<const primitive_id>> ret;
-        ret.reserve(1 + !bias.empty() + !fc_grad.empty() + !prev_weights_grad.empty() + !prev_bias_grad.empty());
-
-        ret.push_back(weights);
-        if (!bias.empty())
-            ret.push_back(bias);
-
-        if (!prev_weights_grad.empty())
-            ret.push_back(prev_weights_grad);
-        if (!prev_bias_grad.empty())
-            ret.push_back(prev_bias_grad);
-        if (!fc_grad.empty())
-            ret.push_back(fc_grad);
-
-        return ret;
-    }
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api/index_select.hpp b/inference-engine/thirdparty/clDNN/api/index_select.hpp
deleted file mode 100644
index 0e6548eec25..00000000000
--- a/inference-engine/thirdparty/clDNN/api/index_select.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-
-/// @brief Axis which index_select primitive will index.
-enum class index_select_axis_name {
-    along_b,
-    along_f,
-    along_y,
-    along_x
-};
-
-/// @brief Selects indices which will be copied to the output.
-///
-/// @details Applies index selection along the specified dimension. The indices to be copied are specified
-/// by @c indices.
-/// @n
-/// @n Example:
-/// @n input_sizes = (1, 2, 4, 2)
-/// @n input_values = (a, b, c, d)
-/// @n                (e, f, g, h)
-/// @n indices_sizes = (1, 1, 6, 1)
-/// @n indices_values = {0, 0, 1, 1, 3, 3}
-/// @n For axis: along_x:
-/// @n output_sizes = (1, 2, 6, 2)
-/// @n output_values = (a, a, b, b, d, d)
-/// @n                 (e, e, f, f, h, h)
-/// @n
-/// @n The resulting output has sizes equal to the input sizes, except that the size along the selected axis becomes the indices' x size.
-/// @n
-/// @n@b Requirements:
-/// @n - @c input must be a valid primitive_id, whose output format is bfyx/yxfb;
-/// @n - @c indices must be a valid primitive_id, whose output layout is: (bfyx/yxfb, i32, {1, 1, indices_size, 1})
-/// @n - @c axis - a valid index_select_axis_name instance.
-/// @n Breaking any of these conditions will cause an exception to be thrown.
-struct index_select : public primitive_base<index_select> {
-    CLDNN_DECLARE_PRIMITIVE(index_select)
-
-    /// @brief Constructs index_select primitive / layer.
-    ///
-    /// @param id An identifier of the new primitive.
-    /// @param input An identifier of the primitive which is an input for the newly created
-    /// index_select primitive.
-    /// @param indices An identifier of the primitive which holds the indices, laid out in memory along x.
-    /// @param axis Axis of index selection.
-    /// @param output_padding Optional padding for output from primitive.
- index_select( - const primitive_id& id, - const primitive_id& input, - const primitive_id& indices, - index_select_axis_name axis = index_select_axis_name::along_b, - const padding& output_padding = padding()) - : primitive_base(id, {input, indices}, output_padding), axis({axis}), reverse(false) {} - - /// @brief Constructs index_select primitive / layer. - /// - /// @param id An identifier of new primitive. - /// @param input An identifier of primitive, which is an input for newly created - /// index_select primitive. - /// @param axis Axis of index selecting. - /// @param output_padding Optional padding for output from primitive. - index_select( - const primitive_id& id, - const primitive_id& input, - index_select_axis_name axis = index_select_axis_name::along_b, - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), axis({axis}), reverse(true) {} - - /// @brief Constructs index_select primitive / layer. - /// - /// @param id An identifier of new primitive. - /// @param input An identifier of primitive, which is an input for newly created - /// index_select primitive. - /// @param axis Vector of axes of index selecting. - /// @param output_padding Optional padding for output from primitive. - index_select( - const primitive_id& id, - const primitive_id& input, - const std::vector& axis = {index_select_axis_name::along_b}, - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), axis(axis), reverse(true) {} - - /// @brief A list of axes of index selecting - std::vector axis; - /// @brief Do index_select in reverse order on axis/axes. - bool reverse; -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/lookup_table.hpp b/inference-engine/thirdparty/clDNN/api/lookup_table.hpp deleted file mode 100644 index 65349edd55c..00000000000 --- a/inference-engine/thirdparty/clDNN/api/lookup_table.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Returns values from data on which given indices are pointing at. -struct lookup_table : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(lookup_table) - - /// @brief Enum type to specify axis to maximize/minimize along. - enum axis_name { batch, feature, x, y, xyf }; - - /// @brief Constructs lookup_table primitive. - /// @param id This primitive id. - /// @param input_data Input data primitive id. - /// @param input_indices Input indices primitive id. - /// @param axis Axis to return values from. 
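The along_x example in the index_select docs above is a gather along one axis; a standalone sketch of exactly that case, with a flat row-major layout assumed:

    #include <cstdint>
    #include <vector>

    // For each row, copies the columns named by 'indices':
    // out[row][j] = in[row][indices[j]], so (a, b, c, d) with indices
    // {0, 0, 1, 1, 3, 3} becomes (a, a, b, b, d, d) as in the removed docs.
    std::vector<float> index_select_along_x(const std::vector<float>& in,
                                            size_t rows, size_t x_size,
                                            const std::vector<int32_t>& indices) {
        std::vector<float> out(rows * indices.size());
        for (size_t r = 0; r < rows; ++r)
            for (size_t j = 0; j < indices.size(); ++j)
                out[r * indices.size() + j] = in[r * x_size + static_cast<size_t>(indices[j])];
        return out;
    }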
-    lookup_table(const primitive_id& id,
-                 const primitive_id& input_data,
-                 const primitive_id& input_indices,
-                 axis_name axis = axis_name::xyf,
-                 const padding& output_padding = padding())
-        : primitive_base(id, {input_data, input_indices}, output_padding),
-          axis(axis),
-          with_axis(axis == axis_name::xyf ? false : true) {}
-
-    /// @brief Axis to return values from. If not set, returns the data element each index points at within the flattened x, y, f dimensions of its batch.
-    axis_name axis;
-    /// @brief Indicates that the primitive has a user-defined axis to return values from.
-    bool with_axis;
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api/network.hpp b/inference-engine/thirdparty/clDNN/api/network.hpp
index ffb9645840e..6c83688120d 100644
--- a/inference-engine/thirdparty/clDNN/api/network.hpp
+++ b/inference-engine/thirdparty/clDNN/api/network.hpp
@@ -113,12 +113,6 @@ struct network {
     /// @brief Provides user-supplied @ref memory for output primitives defined by user in source @ref topology.
     void set_output_memory(const primitive_id& id, const memory& mem) const;
 
-    /// @brief Sets learning rate for training primitives.
-    void set_learning_rate(const float lr);
-
-    /// @brief Returns learning rate.
-    float get_learning_rate();
-
     /// @brief Returns stream id.
     uint16_t get_stream_id();
 
diff --git a/inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp b/inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp
deleted file mode 100644
index 667cf5b3d2f..00000000000
--- a/inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Performs the backward pass of the scale primitive for its input.
-struct scale_grad_input : public primitive_base<scale_grad_input> {
-    CLDNN_DECLARE_PRIMITIVE(scale_grad_input)
-
-    /// @brief Constructs scale_grad_input.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param scale_input Scale input primitive id with values needed for product computation.
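scale_grad_input is the data-gradient half of the scale layer: the incoming gradient is multiplied elementwise by the broadcast scale values. A minimal per-feature sketch, with a flat layout assumed:

    #include <vector>

    // grad_in[f][s] = grad_out[f][s] * scale[f]: the backward-for-input pass
    // of a per-feature scale layer, broadcasting scale over spatial positions.
    void scale_grad_input_ref(const std::vector<float>& grad_out, std::vector<float>& grad_in,
                              const std::vector<float>& scale, size_t features, size_t spatial) {
        for (size_t f = 0; f < features; ++f)
            for (size_t s = 0; s < spatial; ++s)
                grad_in[f * spatial + s] = grad_out[f * spatial + s] * scale[f];
    }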
- scale_grad_input(const primitive_id& id, - const primitive_id& input, - const primitive_id& scale_input, // should be bfyx or yxfb, where each dimension can be 1, if all - // dimensions are 1 then this is scalar - const padding& output_padding = padding()) - : primitive_base(id, {input, scale_input}, output_padding) {} - -protected: - std::vector> get_dependencies() const override { return {}; } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp b/inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp deleted file mode 100644 index d13b18d137a..00000000000 --- a/inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/* -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs scale layer backward for scale_input and biases. -struct scale_grad_weights : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(scale_grad_weights) - - /// @brief Constructs scale_grad_weights primitive without bias. - /// @param id This primitive id. - /// @param input Input primitive id. Same as input for scale forward. - /// @param input_grad Input gradient primitive id. - /// @param scale_input Scale input primitive id. - /// @param scale_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. - scale_grad_weights(const primitive_id& id, - const primitive_id& input, - const primitive_id& input_grad, - const primitive_id& scale_input, // should be one number per feature - const primitive_id& scale_grad = "", // leave empty if this is last primitive in backward pass - const padding& output_padding = padding()) - : primitive_base(id, {input, input_grad}, output_padding), - scale_input(scale_input), - bias(""), - prev_scale_grad(""), - prev_bias_grad(""), - scale_grad(scale_grad) {} - - /// @brief Constructs scale_grad_weights primitive with optional adding bias. - /// @param id This primitive id. - /// @param input Input primitive id. Same as input for scale forward. - /// @param input_grad Input gradient primitive id. - /// @param scale_input Scale input primitive id. - /// @param bias Primitive id containing bias data. - /// @param scale_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. 
- scale_grad_weights(const primitive_id& id, - const primitive_id& input, - const primitive_id& input_grad, - const primitive_id& scale_input, // should be one number per feature - const primitive_id& bias, // should be same size as scale_input - const primitive_id& scale_grad = "", // leave empty if this is last primitive in backward pass - const padding& output_padding = padding()) - : primitive_base(id, {input, input_grad}, output_padding), - scale_input(scale_input), - bias(bias), - prev_scale_grad(""), - prev_bias_grad(""), - scale_grad(scale_grad) {} - - /// @brief Constructs scale_grad_weights primitive with optional bias and momentum optimizer. - /// @param id This primitive id. - /// @param input Input primitive id. Same as input for scale forward. - /// @param input_grad Input gradient primitive id. - /// @param scale_input Scale input primitive id. - /// @param bias Primitive id containing bias data. - /// @param prev_scale_grad Id of primitive which contains scale gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param prev_bias_grad Id of primitive which contains bias gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param scale_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. - scale_grad_weights(const primitive_id& id, - const primitive_id& input, - const primitive_id& input_grad, - const primitive_id& scale_input, // should be one number per feature - const primitive_id& bias, // should be same size as scale_input - const primitive_id& prev_scale_grad, - const primitive_id& prev_bias_grad, // leave empty if bias not specified - const primitive_id& scale_grad = "", // leave empty if this is last primitive in backward pass - const padding& output_padding = padding()) - : primitive_base(id, {input, input_grad}, output_padding), - scale_input(scale_input), - bias(bias), - prev_scale_grad(prev_scale_grad), - prev_bias_grad(prev_bias_grad), - scale_grad(scale_grad) {} - - /// @brief Scale input primitive id. - primitive_id scale_input; - /// @brief Primitive id containing bias data. - primitive_id bias; - /// @brief Primitive id containing scale gradient data calculated in previous iteration. - primitive_id prev_scale_grad; - /// @brief Primitive id containing bias gradient data calculated in previous iteration. - primitive_id prev_bias_grad; - /// @brief Primitive id which uses weights and biases updated in this primitive. - primitive_id scale_grad; - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - ret.reserve(1 + !bias.empty() + !prev_scale_grad.empty() + !prev_bias_grad.empty()); - - ret.push_back(scale_input); - if (!bias.empty()) - ret.push_back(bias); - if (!prev_scale_grad.empty()) - ret.push_back(prev_scale_grad); - if (!prev_bias_grad.empty()) - ret.push_back(prev_bias_grad); - if (!scale_grad.empty()) - ret.push_back(scale_grad); - - return ret; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp b/inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp deleted file mode 100644 index e436f5b0baa..00000000000 --- a/inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
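The weight-side counterpart, scale_grad_weights, reduces input * gradient (and, when a bias is present, the plain gradient) over everything except the feature axis, yielding one number per feature as the constructor comments note. A standalone sketch under that reading:

    #include <vector>

    // d_scale[f] = sum_s input[f][s] * grad[f][s]; d_bias[f] = sum_s grad[f][s].
    // One value per feature, matching the "should be one number per feature"
    // comments on scale_input in the removed constructors.
    void scale_grad_weights_ref(const std::vector<float>& input, const std::vector<float>& grad,
                                std::vector<float>& d_scale, std::vector<float>& d_bias,
                                size_t features, size_t spatial) {
        for (size_t f = 0; f < features; ++f) {
            d_scale[f] = 0.0f;
            d_bias[f] = 0.0f;
            for (size_t s = 0; s < spatial; ++s) {
                d_scale[f] += input[f * spatial + s] * grad[f * spatial + s];
                d_bias[f] += grad[f * spatial + s];
            }
        }
    }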
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Backward pass for Softmax log loss.
-/// @details The output values are the same as input_prob, except for the element at the position given by the label, which has 1 subtracted from it.
-struct softmax_loss_grad : public primitive_base<softmax_loss_grad> {
-    CLDNN_DECLARE_PRIMITIVE(softmax_loss_grad)
-
-    /// @brief Constructs softmax_loss_grad primitive.
-    /// @param id This primitive id.
-    /// @param input_prob Input primitive id.
-    /// @param labels Labels primitive id.
-    softmax_loss_grad(const primitive_id& id,
-                      const primitive_id& input_prob,
-                      const primitive_id& labels,
-                      const padding& output_padding = padding())
-        : primitive_base(id, {input_prob, labels}, output_padding) {}
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp b/inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp
deleted file mode 100644
index cf27c417e42..00000000000
--- a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "api/primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Primitive that fuses convolution, batch norm, scale and, optionally, Relu.
-struct fused_conv_bn_scale : public primitive_base<fused_conv_bn_scale> {
-    CLDNN_DECLARE_PRIMITIVE(fused_conv_bn_scale)
-
-    /// @brief Constructs convolution primitive fused with batch norm and scale.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param weights List of primitive ids containing weights data.
-    /// @param bias List of primitive ids containing bias data.
-    /// @param epsilon Small number to protect against division by zero.
-    /// @param scale_input Scale input primitive id with values needed for product computation. Used in the fused scale part.
-    /// @param scale_bias Primitive id containing bias data for the fused scale part.
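The rule in the softmax_loss_grad docs just removed (output equals input_prob, with 1 subtracted at the labelled position) is the classic softmax cross-entropy gradient; a standalone sketch:

    #include <cstdint>
    #include <vector>

    // grad[b][c] = prob[b][c] - (c == label[b] ? 1 : 0): copies the
    // probabilities and subtracts 1 at the labelled class, per batch item.
    std::vector<float> softmax_loss_grad_ref(const std::vector<float>& prob,    // batch * classes
                                             const std::vector<int32_t>& label, // batch
                                             size_t classes) {
        std::vector<float> grad = prob;
        for (size_t b = 0; b < label.size(); ++b)
            grad[b * classes + static_cast<size_t>(label[b])] -= 1.0f;
        return grad;
    }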
- /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. Used in fused batch norm part. - /// @param with_activation Enable Relu activation. - /// @param activation_slp Relu activation slope. - fused_conv_bn_scale(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - const std::vector& bias, - float epsilon, - const primitive_id& scale_input, - const primitive_id& scale_bias = "", - tensor stride = {1, 1, 1, 1}, - tensor dilation = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - const primitive_id& inv_variance = "", - const padding& output_padding = padding()) - : primitive_base(id, {input, scale_input}, output_padding), - input_offset(input_offset), - stride(stride), - dilation(dilation), - with_output_size(false), - scale_bias(scale_bias), - inv_variance(inv_variance), - epsilon(epsilon), - weights(weights), - bias(bias) { - if ((bias.size() != 0) && (weights.size() != bias.size())) - throw std::runtime_error("convolution's weights/bias count does not match"); - } - - /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. - tensor input_offset; - /// @brief Defines shift in input buffer between adjacent calculations of output values. - tensor stride; - /// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. - /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. - /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. - tensor dilation; - /// @brief Indicates that the primitive has user-defined output size (non-zero value). - bool with_output_size; - /// @brief User-defined output data size of the primitive (w/o padding). - tensor output_size; - /// @brief Primitive id containing scale bias data for fused convolution. - primitive_id scale_bias; - /// @brief Primitive id containing inverted variance used in future gradient computing for fused convolution. - primitive_id inv_variance; - /// @brief Epsilon for fused convolution. - float epsilon; - /// @brief On how many cards split the computation to. - int32_t split() const { return static_cast(weights.size()); } - /// @brief List of primitive ids containing weights data. - const primitive_id_arr weights; - /// @brief List of primitive ids containing bias data. 
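The dilation comment above already works one dimension by hand (for dilation 2, w[0]*x[0] + w[1]*x[2] + w[2]*x[4]). The same arithmetic as a runnable 1-D sketch, valid output positions only — an illustration, not clDNN code:

    #include <cstddef>
    #include <vector>

    // 1-D dilated convolution: output i reads x[i + k*dilation] for each tap k.
    std::vector<float> conv1d_dilated(const std::vector<float>& x,
                                      const std::vector<float>& w,
                                      std::size_t dilation) {
        const std::size_t span = (w.size() - 1) * dilation + 1;  // receptive field
        std::vector<float> y;
        for (std::size_t i = 0; i + span <= x.size(); ++i) {
            float acc = 0.0f;
            for (std::size_t k = 0; k < w.size(); ++k)
                acc += w[k] * x[i + k * dilation];  // dilation == 1 is ordinary conv
            y.push_back(acc);
        }
        return y;
    }

With w of size 3 and dilation 2 this reproduces the comment's expansion exactly.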
- const primitive_id_arr bias; - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - ret.reserve(weights.size() + bias.size() + !scale_bias.empty() + !inv_variance.empty()); - for (auto& w : weights) ret.push_back(std::ref(w)); - for (auto& b : bias) ret.push_back(std::ref(b)); - if (!scale_bias.empty()) - ret.push_back(scale_bias); - if (!inv_variance.empty()) - ret.push_back(inv_variance); - return ret; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp b/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp index f7528c4e884..deb9a3ec73d 100644 --- a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp +++ b/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp @@ -37,9 +37,6 @@ struct fused_conv_eltwise : public primitive_base { /// @param input Input primitive id. /// @param weights List of primitive ids containing weights data. /// @param bias List of primitive ids containing bias data. - /// @param w_quantization_factor List of primitive ids containing weights quanitization factors per output feature map. - /// @param output_calibration_factors List of primitive ids output containing calibration factors per output feature map. - /// @param i_quantization_factor Input quantization factor /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, /// where (0,0) point of the convolution window should start calculations. /// @param stride Defines shift in input buffer between adjacent calculations of output values. @@ -57,11 +54,6 @@ struct fused_conv_eltwise : public primitive_base { eltwise_mode mode, const std::vector& weights, const std::vector& bias, - const std::vector& conv_w_quantization_factor, - const std::vector& conv_output_calibration_factors, - const float conv_i_quantization_factor, - const float non_conv_scale, - const primitive_id& eltw_output_calibration_factors, const std::vector& eltw_stride, tensor stride = {1, 1, 1, 1}, tensor input_offset = {0, 0, 0, 0}, @@ -74,18 +66,10 @@ struct fused_conv_eltwise : public primitive_base { optional_data_type output_data_type = {}) : primitive_base(id, {input, input2}, output_padding, output_data_type), conv((primitive_id_arr)weights, - (primitive_id_arr)bias, - (primitive_id_arr)conv_w_quantization_factor, - (primitive_id_arr)conv_output_calibration_factors), - eltw(eltw_output_calibration_factors), - non_conv_scale(non_conv_scale), + (primitive_id_arr)bias), + eltw(), conv_weights(weights), - conv_bias(bias), - conv_weights_quantization_factors(conv_w_quantization_factor), - conv_output_calibration_factors(conv_output_calibration_factors) { - conv.input_quantization_factor = conv_i_quantization_factor; - conv.output_quantization_factor = 1.0f; - + conv_bias(bias) { conv.input_offset = input_offset; conv.stride = stride; conv.dilation = dilation; @@ -100,10 +84,6 @@ struct fused_conv_eltwise : public primitive_base { if ((bias.size() != 0) && (weights.size() != bias.size())) throw std::runtime_error("convolution's weights/bias count does not match"); - if (conv.output_calibration_factors.size()) { - if ((weights.size() != 0) && (weights.size() != conv.weights_quantization_factors.size())) - throw std::runtime_error("convolution's weights count does not match quantization factors count"); - } } struct conv_data { @@ -111,14 +91,6 @@ struct fused_conv_eltwise : public primitive_base { const primitive_id_arr weights; /// @brief List 
of primitive ids containing bias data. const primitive_id_arr bias; - /// @brief List of primitive ids containing weights quanitization factors per output feature map. - const primitive_id_arr weights_quantization_factors; - /// @brief List of primitive ids containing output quanitization factors per output feature map for convolution. - const primitive_id_arr output_calibration_factors; - /// @brief Input quantization factor for convolution - float input_quantization_factor; - /// @brief Output quantization factor for convolution - float output_quantization_factor; /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. tensor input_offset; /// @brief Defines shift in input buffer between adjacent calculations of output values. @@ -137,20 +109,12 @@ struct fused_conv_eltwise : public primitive_base { tensor output_size; conv_data(const primitive_id_arr& weights, - const primitive_id_arr& bias, - const primitive_id_arr& weights_quantization_factors, - const primitive_id_arr& output_calibration_factors) + const primitive_id_arr& bias) : weights(weights), - bias(bias), - weights_quantization_factors(weights_quantization_factors), - output_calibration_factors(output_calibration_factors) {} + bias(bias) {} } conv; struct eltw_data { - /// @brief Primitive id containing output quanitization factors per output feature map. - primitive_id output_calibration_factors; - /// @brief Output quantization factor for eltwise - float output_quantization_factor; /// @param mode Eltwise mode. eltwise_mode mode; /// @brief Enable Relu activation. @@ -159,22 +123,11 @@ struct fused_conv_eltwise : public primitive_base { float activation_negative_slope; /// @brief Defines shift in input buffers between adjacent calculations of output values. std::vector stride; - explicit eltw_data(const primitive_id& output_calibration_factors) - : output_calibration_factors(output_calibration_factors) {} } eltw; /// @brief On how many cards split the computation to. int32_t split() const { return static_cast(conv.weights.size()); } - // FIXME: In fact, that should be needed for any EltWise primitive, not - // only the fused one. What's more important, these scales should be - // separate for different inputs and probably per-channel, not per - // primitive. - // - // I'm only needing a scalar for my particular task, so let's hack like - // this in the meantime. The final design is still to be investigated. - float non_conv_scale = 1.0f; - /// @brief Is optimization that output contains data from second input ON ? bool second_input_in_output = false; bool depth_to_space_already_fused = false; @@ -182,21 +135,13 @@ struct fused_conv_eltwise : public primitive_base { protected: const primitive_id_arr conv_weights; const primitive_id_arr conv_bias; - const primitive_id_arr conv_weights_quantization_factors; - const primitive_id_arr conv_output_calibration_factors; std::vector> get_dependencies() const override { std::vector> ret; - ret.reserve(conv.weights.size() + conv.bias.size() + conv.weights_quantization_factors.size() + - conv.output_calibration_factors.size() + (eltw.output_calibration_factors.empty() ? 
0 : 1)); + ret.reserve(conv.weights.size() + conv.bias.size()); for (auto& w : conv.weights) ret.push_back(std::ref(w)); for (auto& b : conv.bias) ret.push_back(std::ref(b)); - for (auto& q : conv.weights_quantization_factors) ret.push_back(std::ref(q)); - for (auto& q : conv.output_calibration_factors) ret.push_back(std::ref(q)); - - if (!eltw.output_calibration_factors.empty()) - ret.push_back(eltw.output_calibration_factors); return ret; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h index 5865ca81734..d159874ee0a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h @@ -67,7 +67,6 @@ inline uint8_t GetActivationAdditionalParamsNumber(ActivationFunction func) { break; case ActivationFunction::RELU_NEGATIVE_SLOPE: case ActivationFunction::ELU: - case ActivationFunction::RELU_NEGATIVE_SLOPE_GRAD: paramsNum = 1; break; default: diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h index 58772662a89..c8e39446e3e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h @@ -25,8 +25,6 @@ enum class KernelType { UNKNOWN, ARG_MAX_MIN, AVERAGE_UNPOOLING, - BATCH_NORM_GRAD, - LOOKUP_TABLE, CONVOLUTION, DECONVOLUTION, LRN, @@ -38,9 +36,7 @@ enum class KernelType { SOFT_MAX, ELTWISE, SCALE, - FUSED_CONV_BN_SCALE, FUSED_CONV_ELTWISE, - TABLE_LOOKUP, REORDER, RESHAPE, PERMUTE, @@ -49,21 +45,14 @@ enum class KernelType { REGION_YOLO, REORG_YOLO, MAX_UNPOOLING, - CONVOLUTION_GRAD_WEIGHTS, - SCALE_GRAD_WEIGHTS, MVN, - FULLY_CONNECTED_GRAD_INPUT, - FULLY_CONNECTED_GRAD_WEIGHTS, LSTM_GEMM, LSTM_ELT, - EMBED, - SOFT_MAX_LOSS_GRAD, BORDER, TILE, SELECT, BROADCAST, GEMM, - INDEX_SELECT, PYRAMID_ROI_ALIGN, CONTRACT, ONE_HOT, @@ -133,8 +122,6 @@ enum class ActivationFunction { SQRT, LINEAR, ELU, - RELU_GRAD, - RELU_NEGATIVE_SLOPE_GRAD, SIN, ASIN, SINH, @@ -155,7 +142,6 @@ enum class ActivationFunction { NEGATIVE, NOT, POW, - NONE_GRAD, ERF, HARD_SIGMOID, RECIPROCAL, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp index e85a6e0eaaf..31b20418efb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp @@ -103,9 +103,6 @@ KernelsData ActivationKernelBase::GetCommonKernelsData(const Params& params, con FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params)); - if (newParams.gradient) - kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); - if (!newParams.inputActivationParams.empty()) { kernel.arguments.push_back({ArgumentDescriptor::Types::SLOPE, 0}); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp index 0a14ff8c472..cbe17079757 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp @@ -34,7 +34,6 @@ ParamsKey ActivationKernelOpt::GetSupportedKey() const { k.EnableAllOutputLayout(); k.EnableTensorOffset(); k.EnableBatching(); - k.EnableGradient(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp index cc3231946a7..89f019c7af6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp @@ -38,7 +38,6 @@ ParamsKey ActivationKernelRef::GetSupportedKey() const { k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); - k.EnableGradient(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp deleted file mode 100644 index ca181884b13..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "batch_norm_kernel_base.h" -#include - -namespace kernel_selector { -bool BatchNormKernelBase::Validate(const Params& p, const optional_params& o) const { - if (p.GetType() != KernelType::BATCH_NORM_GRAD || o.GetType() != KernelType::BATCH_NORM_GRAD) { - return false; - } - - return true; -} - -JitConstants BatchNormKernelBase::GetJitConstants(const batch_norm_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - jit.AddConstant(MakeJitConstant("EPSILON", params.batchNormParams.epsilon)); - if (params.batchNormParams.with_inv_var) - jit.AddConstant(MakeJitConstant("FORWARD", 1)); - if (params.batchNormParams.with_scale_shift) - jit.AddConstant(MakeJitConstant("SCALE_SHIFT", 1)); - if (params.batchNormParams.with_mean_var_out) - jit.AddConstant(MakeJitConstant("MEAN_VAR_OUT", 1)); - - return jit; -} - -BatchNormKernelBase::DispatchData BatchNormKernelBase::SetDefault(const batch_norm_params& params) const { - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - kd.gws0 = params.inputs[0].Batch().v; - kd.gws1 = params.inputs[0].Feature().v; - kd.gws2 = 1; - - kd.lws0 = std::min(std::max(kd.gws0, static_cast(1)), static_cast(256)); - while (kd.gws0 % kd.lws0 != 0) { - --kd.lws0; - } - kd.lws1 = 1; - kd.lws2 = 1; - - return kd; -} - -KernelsData BatchNormKernelBase::GetCommonKernelsData(const Params& params, - const optional_params& options, - float estimatedTime) const { - if (!Validate(params, options)) { - return {}; - } - - const batch_norm_params& orgParams = static_cast(params); - - DispatchData runInfo = SetDefault(orgParams); - - KernelData kd = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - int inputs_num = 1 + orgParams.batchNormParams.with_inv_var + 2 * orgParams.batchNormParams.with_scale_shift + - 2 * orgParams.batchNormParams.with_mean_var_out; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, inputs_num); - - kd.estimatedTime = estimatedTime; - - return {kd}; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h deleted file mode 100644 index 91344f2786a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h +++ /dev/null @@ -1,66 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
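A detail in the deleted SetDefault above that is easy to read past: the decrement loop picks lws0 as the largest divisor of the global batch dimension that does not exceed 256, so the local size always divides the global size as OpenCL requires. Isolated as a sketch (hypothetical helper name):

    #include <algorithm>
    #include <cstddef>

    // Largest divisor of gws that is <= cap; mirrors the decrement loop above.
    std::size_t pick_local_size(std::size_t gws, std::size_t cap = 256) {
        std::size_t lws = std::min(std::max<std::size_t>(gws, 1), cap);
        while (gws % lws != 0)
            --lws;      // always terminates: 1 divides everything
        return lws;     // e.g. gws=96 -> 96, gws=300 -> 150, gws=257 -> 1
    }

The linear downward search is fine here because it runs at most 256 iterations, once at kernel-selection time rather than per dispatch.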
-*/ - -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_params : public base_params { - batch_norm_params() : base_params(KernelType::BATCH_NORM_GRAD) {} - - struct DedicatedParams { - float epsilon; - bool with_inv_var; - bool with_scale_shift; - bool with_mean_var_out = false; - }; - - DedicatedParams batchNormParams; - - virtual ParamsKey GetParamsKey() const { - return base_params::GetParamsKey(); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_optional_params : optional_params { - batch_norm_optional_params() : optional_params(KernelType::BATCH_NORM_GRAD) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// BatchNormKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class BatchNormKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - virtual ~BatchNormKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const; - virtual JitConstants GetJitConstants(const batch_norm_params& params) const; - virtual DispatchData SetDefault(const batch_norm_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp deleted file mode 100644 index e839de45283..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
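batch_norm_params above was the host-side source of the kernel's JIT configuration: epsilon became the EPSILON define and each boolean flag conditionally added FORWARD, SCALE_SHIFT, or MEAN_VAR_OUT (see GetJitConstants in the deleted .cpp above). The same mapping written as a plain -D option string, the form such defines ultimately take when the OpenCL program is built — a hypothetical helper, not the kernel_selector JitConstants API:

    #include <sstream>
    #include <string>

    struct BatchNormFlags {  // mirrors batch_norm_params::DedicatedParams above
        float epsilon;
        bool with_inv_var, with_scale_shift, with_mean_var_out;
    };

    std::string build_options(const BatchNormFlags& p) {
        std::ostringstream os;
        os << "-DEPSILON=" << p.epsilon << "f";
        if (p.with_inv_var)      os << " -DFORWARD=1";
        if (p.with_scale_shift)  os << " -DSCALE_SHIFT=1";
        if (p.with_mean_var_out) os << " -DMEAN_VAR_OUT=1";
        return os.str();
    }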
-*/ - -#include "batch_norm_kernel_ref.h" - -namespace kernel_selector { -ParamsKey BatchNormKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableBatching(); - return k; -} - -KernelsData BatchNormKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h deleted file mode 100644 index 117b068446c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "batch_norm_kernel_base.h" - -namespace kernel_selector { -class BatchNormKernelRef : public BatchNormKernelBase { -public: - BatchNormKernelRef() : BatchNormKernelBase("batch_norm_gpu_ref") {} - virtual ~BatchNormKernelRef() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp deleted file mode 100644 index 5d48a80933d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
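GetSupportedKey above is the declarative half of kernel selection: each kernel enumerates the data types and layouts it can handle, and the selector intersects that key with the request before the kernel is ever considered. Reduced to a sketch with stand-in enums (not the real ParamsKey type):

    #include <set>

    enum class DType  { F16, F32, INT8 };
    enum class Layout { bfyx, byxf, yxfb };

    struct SupportKey {
        std::set<DType>  in_types,   out_types;
        std::set<Layout> in_layouts, out_layouts;
        // A request is viable only if every facet is individually supported.
        bool supports(DType it, DType ot, Layout il, Layout ol) const {
            return in_types.count(it)   && out_types.count(ot) &&
                   in_layouts.count(il) && out_layouts.count(ol);
        }
    };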
-*/ - -#include "batch_norm_kernel_selector.h" -#include "batch_norm_kernel_ref.h" - -namespace kernel_selector { - -batch_norm_kernel_selector::batch_norm_kernel_selector() { - Attach(); -} - -KernelsData batch_norm_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::BATCH_NORM_GRAD); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h deleted file mode 100644 index 25915b6bb2c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class batch_norm_kernel_selector : public kernel_selector_base { -public: - static batch_norm_kernel_selector& Instance() { - static batch_norm_kernel_selector instance_; - return instance_; - } - - batch_norm_kernel_selector(); - - virtual ~batch_norm_kernel_selector() {} - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp deleted file mode 100644 index b5f679179a7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
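The deleted selector above follows the registration pattern used throughout kernel_selector: a Meyers singleton whose constructor Attach()es every candidate implementation, with GetNaiveBestKernel then filtering by KernelType and ranking the survivors. The shape of that pattern as a sketch — stand-in types, and first-supported-wins in place of the real ranking:

    #include <memory>
    #include <vector>

    struct Params { /* kernel type, tensor shapes, ... in the real code */ };

    struct KernelImpl {
        virtual ~KernelImpl() = default;
        virtual bool Supports(const Params&) const = 0;
    };

    class ExampleSelector {
    public:
        static ExampleSelector& Instance() {   // same singleton shape as above
            static ExampleSelector instance;
            return instance;
        }
        const KernelImpl* Pick(const Params& p) const {
            for (const auto& k : kernels_)     // simplified: first match wins
                if (k->Supports(p)) return k.get();
            return nullptr;
        }
    private:
        ExampleSelector() { /* Attach<SomeKernelRef>() calls go here */ }
        std::vector<std::unique_ptr<KernelImpl>> kernels_;
    };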
-*/ - -#include "batch_norm_grad_kernel_base.h" - -namespace kernel_selector { -bool BatchNormGradKernelBase::Validate(const Params& p, const optional_params& o) const { - if (p.GetType() != KernelType::BATCH_NORM_GRAD || - o.GetType() != KernelType::BATCH_NORM_GRAD) { - return false; - } - - return true; -} - -JitConstants BatchNormGradKernelBase::GetJitConstants(const batch_norm_grad_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - return jit; -} - -BatchNormGradKernelBase::DispatchData BatchNormGradKernelBase::SetDefault(const batch_norm_grad_params& params) const { - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - kd.gws0 = params.inputs[0].Batch().v; - kd.gws1 = params.inputs[0].Feature().v; - kd.gws2 = 1; - - kd.lws0 = params.inputs[0].Batch().v; - kd.lws1 = 1; - kd.lws2 = 1; - - return kd; -} - -KernelsData BatchNormGradKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const { - if (!Validate(params, options)) { - return {}; - } - - const batch_norm_grad_params& orgParams = static_cast(params); - - DispatchData runInfo = SetDefault(orgParams); - - KernelData kd = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 3); - - kd.estimatedTime = estimatedTime; - - return {kd}; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h deleted file mode 100644 index f89a6ec3cf8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_grad_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_grad_params : public base_params { - batch_norm_grad_params() : base_params(KernelType::BATCH_NORM_GRAD) {} - - virtual ParamsKey GetParamsKey() const { - return base_params::GetParamsKey(); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_grad_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_grad_optional_params : optional_params { - batch_norm_grad_optional_params() : optional_params(KernelType::BATCH_NORM_GRAD) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// BatchNormGradKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class BatchNormGradKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - virtual ~BatchNormGradKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const; - virtual JitConstants GetJitConstants(const batch_norm_grad_params& params) const; - virtual DispatchData SetDefault(const batch_norm_grad_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp deleted file mode 100644 index c775d379fa9..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "batch_norm_grad_kernel_ref.h" - -namespace kernel_selector { -ParamsKey BatchNormGradKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableBatching(); - return k; -} - -KernelsData BatchNormGradKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h deleted file mode 100644 index f24fbc2ca98..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "batch_norm_grad_kernel_base.h" - -namespace kernel_selector { -class BatchNormGradKernelRef : public BatchNormGradKernelBase { -public: - BatchNormGradKernelRef() : BatchNormGradKernelBase("batch_norm_grad_gpu_ref") {} - virtual ~BatchNormGradKernelRef() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp deleted file mode 100644 index 6891bd11ed3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "batch_norm_grad_kernel_selector.h" -#include "batch_norm_grad_kernel_ref.h" - -namespace kernel_selector { - -batch_norm_grad_kernel_selector::batch_norm_grad_kernel_selector() { - Attach(); -} - -KernelsData batch_norm_grad_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::BATCH_NORM_GRAD); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h deleted file mode 100644 index 9a20745f9fb..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class batch_norm_grad_kernel_selector : public kernel_selector_base { -public: - static batch_norm_grad_kernel_selector& Instance() { - static batch_norm_grad_kernel_selector instance_; - return instance_; - } - - batch_norm_grad_kernel_selector(); - - virtual ~batch_norm_grad_kernel_selector() {} - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp deleted file mode 100644 index 111971e0a6d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "contract_kernel_base.h" -#include -#include "kernel_selector_utils.h" - -namespace kernel_selector { -JitConstants ContractKernelBase::GetJitConstants(const contract_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - const size_t no_dim_flag = 6; - std::vector output_dims(4, no_dim_flag); - int out_dim = 2; - for (int i = 3; i >= 0; --i) { - if (std::find(params.reduction_axes.begin(), params.reduction_axes.end(), i) == params.reduction_axes.end()) - output_dims.at(i) = out_dim--; - } - - if (output_dims[3] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_X", output_dims.at(3))}); - if (output_dims[2] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_Y", output_dims.at(2))}); - if (output_dims[1] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_F", output_dims.at(1))}); - if (output_dims[0] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_B", output_dims.at(0))}); - - jit.AddConstants({MakeJitConstant("REDUCE_X", output_dims.at(3) == no_dim_flag), - MakeJitConstant("REDUCE_Y", output_dims.at(2) == no_dim_flag), - MakeJitConstant("REDUCE_F", output_dims.at(1) == no_dim_flag), - MakeJitConstant("REDUCE_B", output_dims.at(0) == no_dim_flag)}); - - switch (params.mode) { - case ContractMode::SUM: - jit.AddConstants({MakeJitConstant("REDUCE_SEED", "0"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a + b")}); - break; - case ContractMode::PRODUCT: - jit.AddConstants({MakeJitConstant("REDUCE_SEED", "1"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a * b")}); - break; - case ContractMode::ALL: - jit.AddConstants( - {MakeJitConstant("REDUCE_SEED", "1"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a && b")}); - break; - case ContractMode::ANY: - jit.AddConstants( - {MakeJitConstant("REDUCE_SEED", "0"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a || b")}); - break; - case ContractMode::MAX: - jit.AddConstants({MakeJitConstant("REDUCE_SEED", "UNIT_VAL_MIN"), - MakeJitConstant("REDUCE_OPERATION(a, b)", "UNIT_MAX_FUNC(a,b)")}); - break; - } - - return jit; -} - -ContractKernelBase::DispatchData ContractKernelBase::SetDefault(const contract_params& params) { - const auto& output = params.output; - - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - std::vector global{output.Feature().v, output.Y().v, output.X().v}; - const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); - - kd.gws0 = global[0]; - kd.gws1 = global[1]; - kd.gws2 = global[2]; - - kd.lws0 = local[0]; - kd.lws1 = local[1]; - kd.lws2 = local[2]; - - return kd; -} - -KernelsData ContractKernelBase::GetCommonKernelsData(const Params& params, - const optional_params& options, - float estimated_time) const { - assert(params.GetType() == KernelType::CONTRACT); - - const auto& prim_params = - static_cast(params); - - auto run_info = SetDefault(prim_params); - KernelData k_data = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(prim_params); - auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = k_data.kernels[0]; - FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point); - k_data.estimatedTime = estimated_time; - - return {k_data}; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h deleted file mode 100644 index e5bb4e81f06..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" -#include - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// contract_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct contract_params : public base_params { - contract_params() : base_params(KernelType::CONTRACT), mode(ContractMode::ANY) {} - ContractMode mode; - std::vector reduction_axes; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// contract_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct contract_optional_params : optional_params { - contract_optional_params() : optional_params(KernelType::CONTRACT) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ContractKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class ContractKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - - using DispatchData = CommonDispatchData; - -protected: - JitConstants GetJitConstants(const contract_params& params) const; - static DispatchData SetDefault(const contract_params& params); - KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp deleted file mode 100644 index a9ad9a42496..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
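The .cpp half of this header (above) is the interesting part of contract: each bfyx axis is either kept or reduced, and ContractMode picks a seed value plus a binary fold — SUM seeds 0 with a+b, PRODUCT seeds 1 with a*b, MAX seeds the type minimum with max(a,b) — emitted as the REDUCE_SEED / REDUCE_OPERATION JIT pair. A host-side sketch of the same reduce-over-selected-axes scheme on a dense bfyx buffer (hypothetical code, float-only):

    #include <array>
    #include <cstddef>
    #include <limits>
    #include <vector>

    enum class Mode { SUM, PRODUCT, MAX };

    // Reduce a dense bfyx tensor over the axes flagged in 'reduce' ({b,f,y,x}).
    std::vector<float> contract(const std::vector<float>& in,
                                std::array<std::size_t, 4> dims,
                                std::array<bool, 4> reduce, Mode mode) {
        // Seed and fold, as in the REDUCE_SEED / REDUCE_OPERATION JIT pair.
        const float seed = mode == Mode::SUM     ? 0.0f
                         : mode == Mode::PRODUCT ? 1.0f
                                                 : std::numeric_limits<float>::lowest();
        auto op = [mode](float a, float b) {
            return mode == Mode::SUM     ? a + b
                 : mode == Mode::PRODUCT ? a * b
                                         : (a > b ? a : b);
        };
        std::array<std::size_t, 4> od = dims;
        for (int i = 0; i < 4; ++i)
            if (reduce[i]) od[i] = 1;          // reduced axes collapse to size 1
        std::vector<float> out(od[0] * od[1] * od[2] * od[3], seed);
        for (std::size_t b = 0; b < dims[0]; ++b)
        for (std::size_t f = 0; f < dims[1]; ++f)
        for (std::size_t y = 0; y < dims[2]; ++y)
        for (std::size_t x = 0; x < dims[3]; ++x) {
            const std::size_t src = ((b * dims[1] + f) * dims[2] + y) * dims[3] + x;
            const std::size_t dst =
                (((reduce[0] ? 0 : b) * od[1] + (reduce[1] ? 0 : f)) * od[2] +
                 (reduce[2] ? 0 : y)) * od[3] + (reduce[3] ? 0 : x);
            out[dst] = op(out[dst], in[src]);
        }
        return out;
    }

Every input element folds into the output cell whose reduced coordinates are clamped to zero, which is exactly what seeding the output and folding in source order achieves.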
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "contract_kernel_ref.h" - -namespace kernel_selector { -ParamsKey ContractKernelRef::GetSupportedKey() const { - ParamsKey k; - - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableInputDataType(Datatype::INT32); - k.EnableInputDataType(Datatype::INT64); - - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT32); - k.EnableOutputDataType(Datatype::INT64); - - k.EnableInputLayout(DataLayout::bfyx); - - k.EnableOutputLayout(DataLayout::bfyx); - - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - - return k; -} - -KernelsData ContractKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h deleted file mode 100644 index feabcafab76..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "contract_kernel_base.h" - -namespace kernel_selector { -class ContractKernelRef : public ContractKernelBase { -public: - ContractKernelRef() : ContractKernelBase("contract_ref") {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp deleted file mode 100644 index e339c1fefa6..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "contract_kernel_selector.h" -#include "contract_kernel_ref.h" - -namespace kernel_selector { -contract_kernel_selector::contract_kernel_selector() { Attach(); } - -KernelsData contract_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::CONTRACT); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h deleted file mode 100644 index b286988d504..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class contract_kernel_selector : public kernel_selector_base { -public: - static contract_kernel_selector& Instance() { - static contract_kernel_selector instance; - return instance; - } - - contract_kernel_selector(); - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp deleted file mode 100644 index ce52aedc295..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byx8_f4); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::Validate(const Params& p, const optional_params& o) const { - if (!Parent::Validate(p, o)) { - return false; - } - - return true; -} - -size_t static get_wg_batch_size(const convolution_params& params) { - if (params.inputs[0].Batch().v % 64 == 0) - return 32; - return 1; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = (arg.output.Batch().v * arg.output.Feature().v) / (4 * 2); - runInfo.gws1 = arg.output.X().v / 8; - runInfo.gws2 = arg.output.Y().v / 2; - - runInfo.lws0 = 8 * get_wg_batch_size(arg); - runInfo.lws1 = 1; - runInfo.lws2 = 1; - - return runInfo; -} - -JitConstants ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetJitConstants(const convolution_params& params, - const DispatchData& kd) const { - auto jits = ConvolutionKernelBase::GetJitConstants(params, kd); - - jits.AddConstant(MakeJitConstant("WG_BATCH_SIZE", get_wg_batch_size(params))); - - return jits; -} - -KernelsData ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_3; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h deleted file mode 100644 index b7eeb1e51fa..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
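Worth noting in the deleted SetDefault above: get_wg_batch_size switches the work-group geometry on the batch size, packing 32 batches per work-group when the batch is a multiple of 64 (lws0 = 8 * 32 = 256) and falling back to lws0 = 8 otherwise. The dispatch arithmetic, isolated as a sketch (hypothetical struct):

    #include <cstddef>

    struct Dispatch { std::size_t gws[3], lws[3]; };

    // Mirrors ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::SetDefault above.
    Dispatch set_default(std::size_t batch, std::size_t feature,
                         std::size_t x, std::size_t y) {
        const std::size_t wg_batch = (batch % 64 == 0) ? 32 : 1;
        return Dispatch{
            { (batch * feature) / (4 * 2), x / 8, y / 2 },  // global sizes
            { 8 * wg_batch, 1, 1 }                          // local sizes
        };
    }

The integer divisions encode the kernel's implicit preconditions (batch*feature divisible by 8, x by 8, y by 2); inputs violating them would under-cover the output.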
- - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32() - : ConvolutionKernelBase("convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32") {} - virtual ~ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - ConvolutionKernelBase::DispatchData SetDefault(const convolution_params& arg, int) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_y_x8_osv8_isv4_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp deleted file mode 100644 index 45bb5a16728..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = (arg.output.Batch().v * arg.output.Feature().v) / 4; - runInfo.gws1 = arg.output.X().v / 8; - runInfo.gws2 = arg.output.Y().v; - - runInfo.lws0 = 8; - runInfo.lws1 = 1; - runInfo.lws2 = 1; - - return runInfo; -} - -KernelsData ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h deleted file mode 100644 index 3507b157074..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
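The ParamsKey blocks in these files act as capability masks: a kernel enables exactly the datatypes, layouts, and features it can execute, and the selector discards kernels whose mask does not cover the request. A self-contained sketch of the idea; the bit names are invented for illustration and do not match clDNN's encoding.

    #include <cstdint>

    enum : uint32_t {
        SUPPORT_INT8_INPUT  = 1u << 0,
        SUPPORT_INT8_OUTPUT = 1u << 1,
        SUPPORT_BYXF_INPUT  = 1u << 2,
        SUPPORT_DILATION    = 1u << 3,
    };

    // A kernel is eligible only if every requested capability bit is enabled.
    constexpr bool Supports(uint32_t kernel_key, uint32_t request) {
        return (kernel_key & request) == request;
    }

    static_assert(Supports(SUPPORT_INT8_INPUT | SUPPORT_INT8_OUTPUT | SUPPORT_BYXF_INPUT,
                           SUPPORT_INT8_INPUT | SUPPORT_BYXF_INPUT),
                  "an int8 byxf request fits this kernel");
    static_assert(!Supports(SUPPORT_INT8_INPUT | SUPPORT_BYXF_INPUT, SUPPORT_DILATION),
                  "a dilated request is rejected, much like a kernel without EnableDilation()");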
- - -#pragma once - -#include "convolution_kernel_base.h" -#include <vector> - -namespace kernel_selector { - -class ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32 : public ConvolutionKernelBase { -public: - ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32() : ConvolutionKernelBase("convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32") {} - virtual ~ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - ConvolutionKernelBase::DispatchData SetDefault(const convolution_params& arg, int) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::yxio; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp deleted file mode 100644 index 35fade3594b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_kernel_mmad_1x1_gemm.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad_1x1_gemm::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableDepthwiseSeparableOpt(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_1x1_gemm::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o)) { - return false; - } - - const auto& params = static_cast<const convolution_params&>(p); - - if (params.filterSize.x != 1 || params.filterSize.y != 1) - return false; - - if (params.stride.x != 1 || params.stride.y != 1) - return false; - - if (params.padding.x != 0 || params.padding.y != 0) - return false; - - const auto& input = params.inputs[0]; - - // we do not support padded input - if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0) - return false; - - if (params.split != 1) - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_1x1_gemm::SetDefault(const convolution_params& arg, int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - // Sub-group size used by "convolution_1x1_gemm_MMAD" kernel.
- constexpr size_t sub_group_size = 8; - - const auto of_maps = arg.output.Feature().v; - const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - - runInfo.efficiency = FORCE_PRIORITY_2; - - runInfo.gws0 = RoundUp(arg.output.X().v * arg.output.Y().v, 8) / 8; - runInfo.gws1 = of_threads_per_batch * arg.output.Batch().v; - runInfo.gws2 = 1; - - runInfo.lws0 = 1; - runInfo.lws1 = sub_group_size; - runInfo.lws2 = 1; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_1x1_gemm::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_1x1_gemm::GetKernelsData(const Params& params, const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h deleted file mode 100644 index 001e92a16c8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
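The FILTER_OFM_BLOCK_PITCH computation above packs weights into 4*8*8-element blocks (the os_is_yx_isa8_osv8_isv4 layout) with the input-feature count aligned up to 32. Reproduced as standalone arithmetic with a hypothetical IFM of 48 and a 1x1 filter: Align(48, 32) = 64, so the pitch is (64/32) * 1 * 1 * 256 = 512 elements.

    #include <cstddef>

    // Stand-in for kernel_selector's Align(): round v up to a multiple of a.
    constexpr size_t AlignTo(size_t v, size_t a) { return ((v + a - 1) / a) * a; }

    constexpr size_t filter_ofm_block_pitch(size_t ifm, size_t filter_x, size_t filter_y) {
        return (AlignTo(ifm, 32) / 32) * filter_x * filter_y * 4 * 8 * 8;  // 4*8*8 = one weight block
    }

    static_assert(AlignTo(48, 32) == 64, "48 input features round up to 64");
    static_assert(filter_ofm_block_pitch(48, 1, 1) == 512, "matches the worked example above");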
- - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_1x1_gemm : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_1x1_gemm() : ConvolutionKernelBase("convolution_gpu_1x1_gemm_MMAD") {} - virtual ~ConvolutionKernel_mmad_1x1_gemm() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_yx_isa8_osv8_isv4; - } -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp deleted file mode 100644 index 69d79e7527e..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast<const convolution_params&>(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 128 != 0) // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M=128 - return false; - - if (k % 32 != 0) // Matrix size K, Must be multiple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 128; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 128)); // Work-Group tile size M, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", 1)); // Persistent threads - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", 1)); // Persistent threads - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h deleted file mode 100644 index 4ae916967fc..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file
except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8() - : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_128x128wg_slm_int8") {} - - virtual ~ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp deleted file mode 100644 index 6360cd68b1d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
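The 32x32sg/128x128wg kernel above, along with the 224x128 and 32x32 variants that follow, treats the 1x1 convolution as a GEMM with M = X*Y*B, K = input features, and N = output features. Its thread-count arithmetic, restated with hypothetical sizes M=256, N=128:

    #include <cstddef>

    constexpr size_t SG_TILE_M = 32, SG_TILE_N = 32, SG_SIZE = 8;
    constexpr size_t WG_TILE_M = 128, WG_TILE_N = 128;
    constexpr size_t TILES_PER_SG_X = 1, TILES_PER_SG_Y = 1;

    // threadsX/threadsY exactly as computed in SetDefault() above.
    constexpr size_t threads_x(size_t n) { return (n / (SG_TILE_N / SG_SIZE)) / TILES_PER_SG_X; }
    constexpr size_t threads_y(size_t m) { return (m / SG_TILE_M) / TILES_PER_SG_Y; }

    static_assert(threads_x(128) == 32, "N=128 gives 32 work-items along X");
    static_assert(threads_y(256) == 8, "M=256 gives 8 work-items along Y");
    // lws = {SG_SIZE * WG_TILE_N / SG_TILE_N, WG_TILE_M / SG_TILE_M, 1} = {32, 4, 1}
    static_assert(SG_SIZE * WG_TILE_N / SG_TILE_N == 32 && WG_TILE_M / SG_TILE_M == 4,
                  "one 32x4 work-group per 128x128 output tile");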
-*/ - -#include "convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast<const convolution_params&>(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 224 != 0) // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M=224 - return false; - - if (k % 32 != 0) // Matrix size K, Must be multiple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 224; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 224)); // Work-Group tile size M, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h deleted file mode 100644 index 4ac16cf5e1b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h +++ /dev/null @@
-1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8() - : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_224x128wg_slm_int8") {} - - virtual ~ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp deleted file mode 100644 index 141ec7cf439..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
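All three deleted MMAD SLM variants validate the same divisibility preconditions on the GEMM sizes. One subtlety: because the work-group tiles are themselves multiples of 32, a conjunction such as m % 32 != 0 && m % 224 != 0 can only ever fail on its first clause, so the shipped check effectively requires only a multiple of 32. The sketch below implements the stricter rule the comments describe (multiple of 32 and of the WG tile); this is an interpretation for illustration, not the shipped behaviour.

    #include <cstddef>

    constexpr bool gemm_sizes_ok(size_t m, size_t k, size_t n,
                                 size_t wg_tile_m, size_t wg_tile_n) {
        return m % 32 == 0 && m % wg_tile_m == 0 &&  // M: multiple of 32 and of the WG tile
               k % 32 == 0 &&                        // K: multiple of 32 (MATRIX_SMALL_K)
               n % 32 == 0 && n % wg_tile_n == 0;    // N: multiple of 32 and of the WG tile
    }

    static_assert(gemm_sizes_ok(256, 64, 128, 128, 128), "fits the 128x128 work-group variant");
    static_assert(!gemm_sizes_ok(256, 64, 128, 224, 128), "M=256 is not a multiple of 224");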
-*/ - -#include "convolution_kernel_mmad_32x32sg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey ConvolutionKernel_mmad_32x32sg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_32x32sg_slm_int8::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast<const convolution_params&>(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0) // Matrix size M, Must be multiple of 32 - return false; - - if (k % 32 != 0) // Matrix size K, Must be multiple of 32 - return false; - - if (n % 32 != 0) // Matrix size N, Must be multiple of 32 - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_slm_int8::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_2; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 32; - size_t _WG_TILE_N = 32; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_32x32sg_slm_int8::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 32)); // Work-Group tile size M, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 32)); // Work-Group tile size N, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_32x32sg_slm_int8::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_2; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h deleted file mode 100644 index 6a9250d5e8a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_32x32sg_slm_int8 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_32x32sg_slm_int8() : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_slm_int8") {} - - virtual ~ConvolutionKernel_mmad_32x32sg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::is_o_yx_isv32; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp index a1f16f37da9..558b2265399 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp @@ -35,19 +35,8 @@ #include "convolution_kernel_winograd_6x3_s1_fused.h" #include "convolution_kernel_mmad.h" #include "convolution_kernel_mmad_blocks.h" -#include "convolution_kernel_mmad_1x1_gemm.h" #include "convolution_kernel_imad_byxf_af32_depthwise.h" -#include "convolution_kernel_mmad_batched.h" #include "convolution_kernel_bfyx_depthwise_weights_lwg.h" -#include "convolution_kernel_mmad_slm_2x14_rep4.h" -#include "convolution_kernel_mmad_slm_7x7_rep4.h" -#include "convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h" -#include "convolution_kernel_mmad_batched_block.h" -#include "convolution_kernel_mmad_batched_block_1x1.h" -#include "convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h" -#include "convolution_kernel_mmad_32x32sg_slm_int8.h" -#include "convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h" #include "convolution_kernel_imad.h" #include "convolution_kernel_fs_byx_fsv32.h" #include "convolution_kernel_fs_byx_fsv32_1x1.h" @@ -134,19 +123,6 @@ convolution_kernel_selector::convolution_kernel_selector() { Attach(); Attach(); Attach(); - Attach(); - - // fs_bs_yx_bsv4_fsv32 int8 - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - // Attach(); // b_fs_yx_fsv4 kernels Attach(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp deleted file mode 100644 index 85a7d07a936..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_1x1.h" - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel1x1::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableSubGroup(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel1x1::Validate(const Params& p, const optional_params&) const { - const convolution_grad_weights_params& params = static_cast<const convolution_grad_weights_params&>(p); - - if (params.filterSize.x != 1 || params.filterSize.y != 1) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel1x1::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.gws0 = 16; - kd.gws1 = input_features; - kd.gws2 = output_features; - kd.lws0 = 16; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_8; - return kd; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h deleted file mode 100644 index 7770075a385..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel1x1 : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel1x1() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_1x1") {} - virtual ~ConvolutionGradWeightsKernel1x1() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp deleted file mode 100644 index 6158d7a7ead..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
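The deleted 1x1 grad-weights kernel above uses a fixed dispatch: one 16-wide work-group per (input feature, output feature) pair. With hypothetical IFM=64 and OFM=128 that is 16 * 64 * 128 = 131072 work-items in 8192 groups. As plain data:

    #include <array>
    #include <cstddef>

    struct Dispatch { std::array<size_t, 3> gws, lws; };

    // Mirrors ConvolutionGradWeightsKernel1x1::SetDefault() above.
    constexpr Dispatch grad_weights_1x1_dispatch(size_t ifm, size_t ofm) {
        return {{16, ifm, ofm}, {16, 1, 1}};
    }

    static_assert(grad_weights_1x1_dispatch(64, 128).gws[1] == 64, "one row of work-groups per input feature");
    static_assert(grad_weights_1x1_dispatch(64, 128).lws[0] == 16, "16 work-items per group");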
-*/ - -#include "convolution_grad_weights_kernel_3x3.h" -#include <algorithm> - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel3x3::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel3x3::Validate(const Params& p, const optional_params&) const { - const auto& params = static_cast<const convolution_grad_weights_params&>(p); - - if (params.stride.x != 1 || params.stride.y != 1) - return false; - if (params.filterSize.x != 3 || params.filterSize.y != 3) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel3x3::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.gws0 = Align(output_features, 16); - kd.gws1 = input_features; - kd.gws2 = 1; - kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32)); - while (kd.gws0 % kd.lws0 != 0) { - kd.lws0 -= 16; - } - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_8; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h deleted file mode 100644 index 48f3591a1bb..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
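The local-size trimming loop in the 3x3 kernel above can only land on 32 or 16: gws0 is aligned to 16, the loop starts from min(gws0, 32), and it steps down by 16 until the value divides gws0. A standalone equivalent with two checked cases:

    #include <algorithm>
    #include <cstddef>

    constexpr size_t pick_lws0(size_t gws0) {  // gws0 is Align(OFM, 16), so a multiple of 16
        size_t lws0 = std::min(std::max(gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
        while (gws0 % lws0 != 0)
            lws0 -= 16;
        return lws0;
    }

    static_assert(pick_lws0(64) == 32, "64 is divisible by 32");
    static_assert(pick_lws0(48) == 16, "48 % 32 != 0, so fall back to 16");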
-*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel3x3 : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel3x3() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_3x3") {} - virtual ~ConvolutionGradWeightsKernel3x3() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp deleted file mode 100644 index 2e4254d3cd4..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_7x7.h" -#include <algorithm> - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel7x7::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel7x7::Validate(const Params& p, const optional_params&) const { - const auto& params = static_cast<const convolution_grad_weights_params&>(p); - - if (params.filterSize.x != 7 || params.filterSize.y != 7) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel7x7::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.gws0 = 8; - kd.gws1 = Align(output_features, 16); - kd.gws2 = input_features; - kd.lws0 = 1; - kd.lws1 = std::min(std::max(kd.gws1, static_cast<size_t>(1)), static_cast<size_t>(32)); - while (kd.gws1 % kd.lws1 != 0) { - kd.lws1 -= 16; - } - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_8; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h deleted file mode 100644 index a1f99ce0799..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel7x7 : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel7x7() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_7x7") {} - virtual ~ConvolutionGradWeightsKernel7x7() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp deleted file mode 100644 index 6d799f73849..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
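Taken together, the deleted 1x1, 3x3, and 7x7 files follow one pattern: each specialization gates itself in Validate() on an exact filter size (and, for 3x3, unit stride) and reports FORCE_PRIORITY_8, while the base implementation that follows dispatches generically at DONT_USE_IF_HAVE_SOMETHING_ELSE priority, so it only runs when no specialization matched. Reduced to its selection logic; the priority values are invented for the sketch, since in clDNN they are opaque efficiency constants:

    #include <cstddef>

    struct FilterSize { size_t x, y; };

    constexpr float kSpecializedPriority = 8.0f;  // stands in for FORCE_PRIORITY_8
    constexpr float kFallbackPriority = 0.0f;     // stands in for DONT_USE_IF_HAVE_SOMETHING_ELSE

    constexpr float priority_for(FilterSize f) {
        const bool specialized = (f.x == 1 && f.y == 1) ||
                                 (f.x == 3 && f.y == 3) ||
                                 (f.x == 7 && f.y == 7);
        return specialized ? kSpecializedPriority : kFallbackPriority;
    }

    static_assert(priority_for({3, 3}) == kSpecializedPriority, "3x3 hits a tuned kernel");
    static_assert(priority_for({5, 5}) == kFallbackPriority, "5x5 falls through to the generic path");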
- - -#include "convolution_grad_weights_kernel_base.h" -#include "kernel_selector_utils.h" -#include -#include -#include - -namespace kernel_selector { -std::string convolution_grad_weights_params::to_string() const { - std::stringstream s; - - s << base_params::to_string() << "_"; - if (bias.empty()) { - s << "no_bias" - << "_"; - } else { - s << "bias_" << bias[0].PhysicalSize() << "_"; - } - s << filterSize.x << "_" << filterSize.y << "_"; - s << stride.x << "_" << stride.y << "_"; - s << dilation.x << "_" << dilation.y << "_"; - s << padding.x << "_" << padding.y << "_"; - s << split; - - return s.str(); -} - -JitConstants ConvolutionGradWeightsKernelBase::GetJitConstants(const convolution_grad_weights_params& cp) const { - JitConstants jit = training_kernel_base::GetJitConstants(cp); - const auto& padding = cp.padding; - const auto& input = cp.inputs[0]; - - int64_t input_offset_with_padding = (int64_t)input.GetFirstElementOffset() - - (cp.filterSize.x - 1 + padding.x) * input.X().pitch - - (cp.filterSize.y - 1 + padding.y) * input.Y().pitch; - input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0); - - jit.AddConstants({ - MakeJitConstant("STRIDE", cp.stride), - MakeJitConstant("PADDING", cp.padding), - MakeJitConstant("DILATION", cp.dilation), - MakeJitConstant("FILTER_ARRAY_NUM", cp.split), - MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), - MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", cp.depthwise_separable_opt), - MakeJitConstant("OUTPUT_GRAD_W", cp.output_grad_w), - }); - - return jit; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernelBase::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - size_t gws0 = output_features * input_features; - size_t lws0 = std::min(gws0, static_cast<size_t>(32)); - while (gws0 % lws0) { - lws0--; - } - kd.gws0 = gws0; - kd.gws1 = params.weights.X().v; - kd.gws2 = params.weights.Y().v; - kd.lws0 = lws0; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE; - return kd; -} - -KernelsData ConvolutionGradWeightsKernelBase::GetKernelsData(const Params& params, - const optional_params& options) const { - assert(params.GetType() == KernelType::CONVOLUTION_GRAD_WEIGHTS); - - if (!Validate(params, options)) { - return {}; - } - - const convolution_grad_weights_params& orgParams = static_cast<const convolution_grad_weights_params&>(params); - - DispatchData runInfo = SetDefault(orgParams); - KernelData kd = KernelData::Default<convolution_grad_weights_params>(params); - convolution_grad_weights_params& newParams = *static_cast<convolution_grad_weights_params*>(kd.params.get()); - - bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oiyx, kd.weightsReorderParams); - - if (!succeed) { - return {}; - } - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, - runInfo, - params.engineInfo, - kernelName, - jit, - entry_point, - DEFAULT, - true, - !orgParams.bias.empty()); - if (newParams.use_momentum) { - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0}); - if (!newParams.bias.empty()) - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_BIAS_GRADIENT, 0}); - } - kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); - kernel.arguments.push_back({ArgumentDescriptor::Types::SPLIT, 0}); - kernel.arguments.push_back({ArgumentDescriptor::Types::LEARNING_RATE, 0}); - - kd.estimatedTime = runInfo.efficiency; - - return {kd}; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h deleted file mode 100644 index d3f843b174d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "training_kernel_base.h" -#include "kernel_selector_params.h" -#include - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// convolution_grad_weights_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct convolution_grad_weights_params : public training_params { - convolution_grad_weights_params() : training_params(KernelType::CONVOLUTION_GRAD_WEIGHTS) {} - - uSize filterSize; - uSize stride; - uSize dilation; - uSize padding; - uint32_t split = 1; - bool depthwise_separable_opt = false; - bool output_grad_w = false; - - std::string to_string() const override; - - ParamsKey GetParamsKey() const override { - ParamsKey k = training_params::GetParamsKey(); - - if (split > 1) { - k.EnableSplitSupport(); - } - - if (dilation.x != 1 || dilation.y != 1) { - k.EnableDilation(); - } - - if (depthwise_separable_opt) { - k.EnableDepthwiseSeparableOpt(); - } - return k; - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// convolution_grad_weights_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct convolution_grad_weights_optional_params : training_optional_params { - convolution_grad_weights_optional_params() : training_optional_params(KernelType::CONVOLUTION_GRAD_WEIGHTS) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ConvolutionGradWeightsKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class ConvolutionGradWeightsKernelBase : public training_kernel_base { -public: - using training_kernel_base::training_kernel_base; - virtual ~ConvolutionGradWeightsKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - virtual KernelsData GetKernelsData(const Params&
params, const optional_params& options) const; - virtual JitConstants GetJitConstants(const convolution_grad_weights_params& params) const; - virtual DispatchData SetDefault(const convolution_grad_weights_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp deleted file mode 100644 index 6ce107dcec7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_ref.h" - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h deleted file mode 100644 index 141ca55ec6f..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
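The GetSupportedKey implementations above describe each kernel's capabilities declaratively; conceptually, ParamsKey behaves like a capability bitmask that the selector intersects with the request. A simplified model for intuition only; SimpleKey and its members are illustrative names, not the real clDNN types:

    #include <cstdint>

    // Toy model of ParamsKey: each Enable* call sets a capability bit; a kernel
    // can serve a request iff every bit the request needs is also set by the kernel.
    struct SimpleKey {                 // illustrative, not the clDNN class
        uint64_t bits = 0;
        void Enable(int bit) { bits |= 1ull << bit; }
        bool Supports(const SimpleKey& request) const {
            return (request.bits & ~bits) == 0;  // request must be a subset
        }
    };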
- - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernelRef : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernelRef() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_ref") {} - virtual ~ConvolutionGradWeightsKernelRef() {} - - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp deleted file mode 100644 index 405c6801516..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "convolution_grad_weights_kernel_selector.h" -#include "convolution_grad_weights_kernel_ref.h" -#include "convolution_grad_weights_kernel_1x1.h" -#include "convolution_grad_weights_kernel_yxfb.h" -#include "convolution_grad_weights_kernel_3x3.h" -#include "convolution_grad_weights_kernel_7x7.h" - -namespace kernel_selector { -convolution_grad_weights_kernel_selector::convolution_grad_weights_kernel_selector() { - Attach<ConvolutionGradWeightsKernelRef>(); - Attach<ConvolutionGradWeightsKernel1x1>(); - Attach<ConvolutionGradWeightsKernel_yxfb>(); - Attach<ConvolutionGradWeightsKernel3x3>(); - Attach<ConvolutionGradWeightsKernel7x7>(); -} - -KernelsData convolution_grad_weights_kernel_selector::GetBestKernels(const Params& params, - const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::CONVOLUTION_GRAD_WEIGHTS); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h deleted file mode 100644 index ed5a30c6df5..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
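The selector deleted above follows the same pattern as every selector in this patch: a singleton attaches candidate kernels, and GetNaiveBestKernel keeps the valid candidate with the best efficiency. A rough sketch of that ranking under assumed interfaces; KernelCandidate, Validate, Efficiency, and PickNaiveBest are illustrative stand-ins, not the kernel_selector API:

    #include <memory>
    #include <vector>

    // Illustrative stand-in for a kernel implementation.
    struct KernelCandidate {
        virtual ~KernelCandidate() = default;
        virtual bool Validate() const = 0;      // e.g. filter-size / layout checks
        virtual float Efficiency() const = 0;   // e.g. FORCE_PRIORITY_7 and friends
    };

    // Naive "best kernel": the valid candidate with the highest efficiency wins.
    const KernelCandidate* PickNaiveBest(
            const std::vector<std::unique_ptr<KernelCandidate>>& attached) {
        const KernelCandidate* best = nullptr;
        for (const auto& k : attached)
            if (k->Validate() && (!best || k->Efficiency() > best->Efficiency()))
                best = k.get();
        return best;
    }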
- - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class convolution_grad_weights_kernel_selector : public kernel_selector_base { -public: - static convolution_grad_weights_kernel_selector& Instance() { - static convolution_grad_weights_kernel_selector instance_; - return instance_; - } - - convolution_grad_weights_kernel_selector(); - - virtual ~convolution_grad_weights_kernel_selector() {} - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp deleted file mode 100644 index d5b63fe62d7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_yxfb.h" - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel_yxfb::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableSubGroup(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel_yxfb::Validate(const Params& p, const optional_params&) const { - const convolution_grad_weights_params& params = static_cast<const convolution_grad_weights_params&>(p); - auto batch = params.inputs[0].Batch().v; - - if (batch % 16 != 0) - return false; - if (params.stride.x != 1 || params.stride.y != 1) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel_yxfb::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - auto x = params.weights.X().v; - auto y = params.weights.Y().v; - - DispatchData kd; - - kd.gws0 = 16; - kd.gws1 = input_features * output_features; - kd.gws2 = x * y; - - kd.lws0 = 16; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_7; - - return kd; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h deleted file mode 100644 index 6e897babfa2..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel_yxfb : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel_yxfb() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_yxfb") {} - virtual ~ConvolutionGradWeightsKernel_yxfb() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp index 8ec74cd406f..4084bdb8ae1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp @@ -35,7 +35,6 @@ ParamsKey DeconvolutionKernel_bfyx_opt::GetSupportedKey() const { k.EnableBatching(); k.EnableSplitSupport(); k.EnableDepthwiseSeparableOpt(); - k.EnableGradient(); k.EnableGroupedConvolution(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp index 4f6bfc29ade..b3d4268b4d9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp @@ -55,7 +55,6 @@ ParamsKey DeconvolutionKernelRef::GetSupportedKey() const { k.EnableBatching(); k.EnableSplitSupport(); k.EnableDepthwiseSeparableOpt(); - k.EnableGradient(); k.EnableGroupedConvolution(); k.EnableDifferentTypes(); k.EnableDifferentInputWeightsTypes(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp index 102fcf2a59f..38c69c3c017 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp @@ -222,9 +222,7 @@ KernelsData EltwiseKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, co kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, - false, - newParams.int8_quantization, - newParams.output_calibration); + false); kd.estimatedTime = runInfo.efficiency; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp deleted file mode 100644 index 1f173bb8886..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp +++ /dev/null @@ -1,288 +0,0 @@ -/* -// Copyright (c) 2019-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "eltwise_kernel_b_fs_yx_fsv4.h" -#include "kernel_selector_utils.h" -#include -#include - -namespace kernel_selector { - -ParamsKey EltwiseKernel_b_fs_yx_fsv4::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnableInt8Quantization(); - k.EnableEltwiseStride(); - return k; -} - -EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv4::SetDefault(const eltwise_params& params) const { - DispatchData kd; - - // Because of very specific requirements for data, we may linearize the data, - // i.e. use only one dimension, e.g. 'X'. - - // GWS: - // we process 4*4 (4 int8 bytes per block_read4 read) features per workitem - kd.gws0 = params.output.X().v * params.output.Y().v * params.output.Batch().v * params.output.Feature().v / (4 * 4); - kd.gws1 = 1; - kd.gws2 = 1; - // LWS: - kd.lws0 = 8; - kd.lws1 = 1; - kd.lws2 = 1; - - kd.efficiency = FORCE_PRIORITY_1; - return kd; -} - -bool EltwiseKernel_b_fs_yx_fsv4::Validate(const Params& params, const optional_params& options) const { - // Requirements to use the 'eltwise_b_fs_yx_fsv4' kernel are below: - // 1. No stride - // 2. All dimensions for all inputs are the same - // 3. No padding - // So, it can be linearized - - if (!Parent::Validate(params, options)) { - return false; - } - - KernelData kd = KernelData::Default<eltwise_params>(params); - eltwise_params& newParams = *static_cast<eltwise_params*>(kd.params.get()); - - // 1. No stride - if (!newParams.stride.empty()) { - return false; - } - - for (size_t i = 0; i < newParams.inputs.size() - 1; i++) { - // 2. All dimensions for all inputs are the same - if (!(newParams.inputs[i] == newParams.inputs[i + 1])) { - return false; - } - } - - const auto& in = newParams.inputs[0]; - for (size_t i = 0; i < in.Dimentions(); i++) { - // 3. No padding - if ((in.GetDims()[i].pad.before != 0) || (in.GetDims()[i].pad.after != 0)) { - return false; - } - } - - return true; -} - -JitConstants EltwiseKernel_b_fs_yx_fsv4::GetJitConstants(const eltwise_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - if (params.inputs[0].GetDType() == Datatype::UINT8) { - // Special handler for unsigned types - jit.AddConstants({MakeJitConstant("ELTW_UNSIGNED", 1)}); - } - - /////////////// - jit.AddConstants({ - MakeJitConstant("ELTWISE_LAYOUT_BASED", params.layoutBased), - MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization), - }); - - if (params.int8_quantization) { - if (params.output_calibration) { - jit.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.output_calibration)); - jit.AddConstant(MakeJitConstant("O_QF", params.output_calibration_factors[0])); - - } else { - jit.AddConstants({MakeJitConstant("O_QF", params.output_quantization_factor)}); - } - } - - std::string inputs_decls; - auto& updateInputs = params.updateInputIds; - - for (size_t i = 0; i < params.inputs.size(); i++) { - // const should be added only to inputs which will not be updated - std::string const_str = "const"; - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) { - if (updateInputs[update_input_idx].inputId == i) { - const_str = ""; - break; - } - } - - inputs_decls += - const_str + " __global " + toCLType(params.inputs[i].GetDType()) + "* input" + std::to_string(i) + ", "; - } - - jit.AddConstant(MakeJitConstant("INPUTS_DECLS", inputs_decls)); - jit.AddConstant(MakeJitConstant("ELTWISE_NO_PITCH_SAME_DIMS", CheckInputsOutputNoPitchSameDims(params))); - - std::string do_eltwise; - - auto& operations = params.operations; - auto& coefficients = params.coefficients; - - for (size_t op_num = 0; op_num < operations.size(); op_num++) { - const std::string op_num_str = std::to_string(op_num); - const auto& ew = operations[op_num]; - - for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - const std::string name = "INPUT_" + op_num_str + "_" + std::to_string(input_idx); - switch (input.mode) { - case EltwiseInputMode::SCALAR: - jit.AddConstant(MakeJitConstant(name, input.scalar)); - break; - case EltwiseInputMode::INPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, - "GET_INPUT(input" + std::to_string(input.index) + ", INPUT" + - std::to_string(input.index) + ")")); - break; - case EltwiseInputMode::OUTPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT, )]")); - break; - case EltwiseInputMode::UNORDERED_ACCESS_INPUT_BUFFER: - jit.AddConstant(MakeJitConstant( - name, - "input" + std::to_string(input.index) + "[(size_t)tmp" + std::to_string(input.tmpIndex) + "]")); - break; - case EltwiseInputMode::INTERMEDIATE_RESULTS_INDEX: - jit.AddConstant(MakeJitConstant(name, "tmp" + std::to_string(input.tmpIndex))); - break; - default: - break; - } - } - std::string input0_str, input1_str, cast_type, op; - - cast_type = "(int16)"; - op = "const int16 tmp" + op_num_str + " = "; - - input0_str = cast_type + "INPUT_" + op_num_str + "_0"; - input1_str = cast_type + "INPUT_" + op_num_str + "_1"; - - if (ew.mode == EltwiseMode::ADD) { - std::vector<std::string> coeff_strings(ew.inputs.size(), ""); - for
(size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - if (input.mode == EltwiseInputMode::INPUT_BUFFER && input.index < coefficients.size()) { - const float c = coefficients[input.index]; - if (c != 1.0f) - coeff_strings[input_idx] = cast_type + "(" + std::to_string(c) + ")*"; - } - } - - input0_str = coeff_strings[0] + input0_str; - input1_str = coeff_strings[1] + input1_str; - } - - switch (ew.mode) { - case EltwiseMode::ADD: - op += input0_str + " + " + input1_str; - break; - case EltwiseMode::SUB: - op += input0_str + " - " + input1_str; - break; - case EltwiseMode::MUL: - op += input0_str + " * " + input1_str; - break; - case EltwiseMode::DIV: - op += input0_str + " / " + input1_str; - break; - case EltwiseMode::MODULU: - case EltwiseMode::MIN: - case EltwiseMode::MAX: { - auto mode = (ew.mode == EltwiseMode::MODULU ? "mod" : (ew.mode == EltwiseMode::MIN ? "min" : "max")); - auto input_0_type = params.inputs[0].GetDType(); - auto input_1_type = params.inputs[1].GetDType(); - - // input_0 == int - if (input_0_type == kernel_selector::Datatype::INT8 || - input_0_type == kernel_selector::Datatype::UINT8) { - // input_0 == int && input_1 == int - if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::UINT8) { - if (ew.mode == EltwiseMode::MODULU) - op += input0_str + " % " + input1_str; - else - op += cast_type + mode + "(" + input0_str + ", " + input1_str + ")"; - // input_0 == int && input_1 != int - } else { - op += cast_type + "f" + mode + "(convert_float(" + input0_str + "), " + input1_str + ")"; - } - // input_0 != int && input_1 == int - } else if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::UINT8) { - op += cast_type + "f" + mode + "(" + input0_str + ", convert_float(" + input1_str + "))"; - // input_0 != int && input_1 != int - } else { - op += cast_type + "f" + mode + "(" + input0_str + ", " + input1_str + ")"; - } - } break; - case EltwiseMode::POW: - op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; - break; - case EltwiseMode::SQRT: - op += cast_type + "sqrt(" + input0_str + ")"; - break; - case EltwiseMode::RSQRT: - op += cast_type + "1/sqrt(" + input0_str + ")"; - break; - case EltwiseMode::ASSIGN: - op += input0_str; - break; - default: - break; - } - - std::string opname = "OPERATION" + op_num_str; - jit.AddConstant(MakeJitConstant(opname, op)); - do_eltwise += "\\\n\t" + opname + ";"; - } - - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) - do_eltwise += "\\\n\tinput" + std::to_string(updateInputs[update_input_idx].inputId) + "[GET_INDEX(INPUT, " + - std::to_string(updateInputs[update_input_idx].inputId) + ")] = tmp" + - std::to_string(updateInputs[update_input_idx].tmpId) + ";"; - - do_eltwise += "\\\n\tres = tmp" + std::to_string(operations.size() - 1) + ";"; - - jit.AddConstant(MakeJitConstant("DO_ELTWISE", do_eltwise)); - - if (params.layoutBased || params.int8_quantization) { - jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); - } - - if (!params.stride.empty()) { - jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); - } - - /////////////// - return jit; -} - -KernelsData EltwiseKernel_b_fs_yx_fsv4::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h deleted file mode 100644 index 72d9d5a09cf..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "eltwise_kernel_base.h" - -namespace kernel_selector { -class EltwiseKernel_b_fs_yx_fsv4 : public EltwiseKernelBase { -public: - using Parent = EltwiseKernelBase; - EltwiseKernel_b_fs_yx_fsv4() : EltwiseKernelBase("eltwise_b_fs_yx_fsv4") {} - virtual ~EltwiseKernel_b_fs_yx_fsv4() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - JitConstants GetJitConstants(const eltwise_params& params) const override; - DispatchData SetDefault(const eltwise_params& params) const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp index e3b4b7ddf6c..dc702492587 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp @@ -51,17 +51,6 @@ static uint32_t GetNumberOfInputs(EltwiseMode m) { ParamsKey eltwise_params::GetParamsKey() const { ParamsKey k = base_params::GetParamsKey(); - if (int8_quantization) { - k.EnableInt8Quantization(); - } - - if (output_calibration) { - k.EnableOutputCalibration(); - } - - if (inputs_calibration) { - k.EnableEltwiseInputsCalibration(); - } if (!stride.empty()) { k.EnableEltwiseStride(); @@ -617,9 +606,7 @@ KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, - false, - newParams.int8_quantization, - newParams.output_calibration); + false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h index 22d398d0b55..0e59efa68b1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h @@ -84,14 +84,8 @@ 
struct eltwise_params : public base_params { bool layoutBased = false; bool int8_quantization = false; - bool output_calibration = false; - float output_quantization_factor = 1.0f; - bool inputs_calibration = false; bool broadcast = false; - MultiDataTensor output_calibration_factors; - MultiDataTensor inputs_calibration_factors; - std::vector<float> input_quantization_factors; virtual ParamsKey GetParamsKey() const; }; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp deleted file mode 100644 index 4e8ff935db3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "eltwise_kernel_fs_bs_yx_bsv4_fsv32.h" -#include "kernel_selector_utils.h" -#include -#include - -namespace kernel_selector { - -ParamsKey EltwiseKernel_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnableInt8Quantization(); - k.EnableEltwiseStride(); - return k; -} - -EltwiseKernelBase::DispatchData EltwiseKernel_fs_bs_yx_bsv4_fsv32::SetDefault(const eltwise_params& params) const { - DispatchData kd; - - kd.gws0 = params.output.X().v; - kd.gws1 = params.output.Y().v; - // we process 4 batches and 4 features per workitem - kd.gws2 = (params.output.Batch().v / 4) * (params.output.Feature().v / 4); - kd.lws0 = 1; - kd.lws1 = 1; - kd.lws2 = 8; - - kd.efficiency = FORCE_PRIORITY_3; - return kd; -} - -JitConstants EltwiseKernel_fs_bs_yx_bsv4_fsv32::GetJitConstants(const eltwise_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - const size_t in_x_pitch = 32 * 4; - const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); - const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); - const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); - const size_t in_offset = - in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); - jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); - - /////////////// - jit.AddConstants({ - MakeJitConstant("ELTWISE_LAYOUT_BASED", params.layoutBased), - MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization), - }); - - if (params.int8_quantization) { - if (params.output_calibration) { - jit.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.output_calibration)); - jit.AddConstant(MakeJitConstant("O_QF", params.output_calibration_factors[0])); - - } else { - jit.AddConstants({MakeJitConstant("O_QF", params.output_quantization_factor)}); - } - } - - std::string inputs_decls; - auto& updateInputs = params.updateInputIds; - - for (size_t i = 0; i < params.inputs.size(); i++) { - // const should be added only to inputs which will not be updated - std::string const_str = "const"; - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) { - if (updateInputs[update_input_idx].inputId == i) { - const_str = ""; - break; - } - } - - inputs_decls += - const_str + " __global " + toCLType(params.inputs[i].GetDType()) + "* input" + std::to_string(i) + ", "; - - if (!params.stride.empty()) { - jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_X", params.stride[i].x)); - jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_Y", params.stride[i].y)); - } - } - - jit.AddConstant(MakeJitConstant("INPUTS_DECLS", inputs_decls)); - jit.AddConstant(MakeJitConstant("ELTWISE_NO_PITCH_SAME_DIMS", CheckInputsOutputNoPitchSameDims(params))); - - std::string do_eltwise; - - auto& operations = params.operations; - auto& coefficients = params.coefficients; - - for (size_t op_num = 0; op_num < operations.size(); op_num++) { - const std::string op_num_str = std::to_string(op_num); - const auto& ew = operations[op_num]; - - for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - const std::string name = "INPUT_" + op_num_str + "_" + std::to_string(input_idx); - switch (input.mode) { - case EltwiseInputMode::SCALAR: - jit.AddConstant(MakeJitConstant(name, input.scalar)); - break; - case EltwiseInputMode::INPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, - "GET_INPUT(input" + std::to_string(input.index) + ", INPUT" + - std::to_string(input.index) + ")")); - break; - case EltwiseInputMode::OUTPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT, )]")); - break; - case EltwiseInputMode::UNORDERED_ACCESS_INPUT_BUFFER: - jit.AddConstant(MakeJitConstant( - name, - "input" + std::to_string(input.index) + "[(size_t)tmp" + std::to_string(input.tmpIndex) + "]")); - break; - case EltwiseInputMode::INTERMEDIATE_RESULTS_INDEX: - jit.AddConstant(MakeJitConstant(name, "tmp" + std::to_string(input.tmpIndex))); - break; - default: - break; - } - } - std::string input0_str, input1_str, cast_type, op; - - if (params.int8_quantization) { - cast_type = "(int16)"; - op = "const int16 tmp" + op_num_str + " = "; - } else { - cast_type = "(UNIT_TYPE)"; - op = "const UNIT_TYPE tmp" + op_num_str + " = "; - } - - input0_str = cast_type + "INPUT_" + op_num_str + "_0"; - input1_str = cast_type + "INPUT_" + op_num_str + "_1"; - - if (ew.mode == EltwiseMode::ADD) { - std::vector<std::string> coeff_strings(ew.inputs.size(), ""); - for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - if (input.mode == EltwiseInputMode::INPUT_BUFFER && input.index < coefficients.size()) { - const float c = coefficients[input.index]; - if (c != 1.0f) - coeff_strings[input_idx] = cast_type + "(" + std::to_string(c) + ")*"; - } - } - - input0_str = coeff_strings[0] + input0_str; - input1_str = coeff_strings[1] + input1_str; - } - - switch
(ew.mode) { - case EltwiseMode::ADD: - op += input0_str + " + " + input1_str; - break; - case EltwiseMode::SUB: - op += input0_str + " - " + input1_str; - break; - case EltwiseMode::MUL: - op += input0_str + " * " + input1_str; - break; - case EltwiseMode::DIV: - op += input0_str + " / " + input1_str; - break; - case EltwiseMode::MODULU: - case EltwiseMode::MIN: - case EltwiseMode::MAX: { - auto mode = (ew.mode == EltwiseMode::MODULU ? "mod" : (ew.mode == EltwiseMode::MIN ? "min" : "max")); - auto input_0_type = params.inputs[0].GetDType(); - auto input_1_type = params.inputs[1].GetDType(); - - // input_0 == int - if (input_0_type == kernel_selector::Datatype::INT8 || - input_0_type == kernel_selector::Datatype::INT32 || - input_0_type == kernel_selector::Datatype::INT64) { - // input_0 == int && input_1 == int - if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::INT32 || - input_1_type == kernel_selector::Datatype::INT64) { - if (ew.mode == EltwiseMode::MODULU) - op += input0_str + " % " + input1_str; - else - op += cast_type + mode + "(" + input0_str + ", " + input1_str + ")"; - // input_0 == int && input_1 != int - } else { - op += cast_type + "f" + mode + "(convert_float(" + input0_str + "), " + input1_str + ")"; - } - // input_0 != int && input_1 == int - } else if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::INT32 || - input_1_type == kernel_selector::Datatype::INT64) { - op += cast_type + "f" + mode + "(" + input0_str + ", convert_float(" + input1_str + "))"; - // input_0 != int && input_1 != int - } else { - op += cast_type + "f" + mode + "(" + input0_str + ", " + input1_str + ")"; - } - } break; - case EltwiseMode::POW: - op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; - break; - case EltwiseMode::SQRT: - op += cast_type + "sqrt(" + input0_str + ")"; - break; - case EltwiseMode::RSQRT: - op += cast_type + "1/sqrt(" + input0_str + ")"; - break; - case EltwiseMode::SQUARED_DIFF: - op += cast_type + "((" + input0_str + " - " + input1_str + - ")" - " * (" + - input0_str + " - " + input1_str + "))"; - break; - case EltwiseMode::EQ: - op += cast_type + "(" + input0_str + " == " + input1_str + ")"; - break; - case EltwiseMode::NE: - op += cast_type + "(" + input0_str + " != " + input1_str + ")"; - break; - case EltwiseMode::LT: - op += cast_type + "(" + input0_str + " < " + input1_str + ")"; - break; - case EltwiseMode::LE: - op += cast_type + "(" + input0_str + " <= " + input1_str + ")"; - break; - case EltwiseMode::GT: - op += cast_type + "(" + input0_str + " > " + input1_str + ")"; - break; - case EltwiseMode::GE: - op += cast_type + "(" + input0_str + " >= " + input1_str + ")"; - break; - case EltwiseMode::LOGIC_AND: - op += cast_type + "(" + input0_str + " && " + input1_str + ")"; - break; - case EltwiseMode::LOGIC_OR: - op += cast_type + "(" + input0_str + " || " + input1_str + ")"; - break; - case EltwiseMode::LOGIC_XOR: - op += cast_type + "(!" + input0_str + " != !" 
+ input1_str + ")"; - break; - case EltwiseMode::FLOOR_MOD: - op += cast_type + "(" + input0_str + " - " + input0_str + " / " + input1_str + " * " + input1_str + ")"; - break; - case EltwiseMode::ASSIGN: - op += input0_str; - break; - default: - break; - } - - std::string opname = "OPERATION" + op_num_str; - jit.AddConstant(MakeJitConstant(opname, op)); - do_eltwise += "\\\n\t" + opname + ";"; - } - - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) - do_eltwise += "\\\n\tinput" + std::to_string(updateInputs[update_input_idx].inputId) + "[GET_INDEX(INPUT, " + - std::to_string(updateInputs[update_input_idx].inputId) + ")] = tmp" + - std::to_string(updateInputs[update_input_idx].tmpId) + ";"; - - do_eltwise += "\\\n\tres = tmp" + std::to_string(operations.size() - 1) + ";"; - - jit.AddConstant(MakeJitConstant("DO_ELTWISE", do_eltwise)); - - if (params.layoutBased || params.int8_quantization) { - jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); - } - - if (!params.stride.empty()) { - jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); - } - - /////////////// - return jit; -} - -KernelsData EltwiseKernel_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h deleted file mode 100644 index d13407a1b7e..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
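Both deleted eltwise kernels assemble their OpenCL body by concatenating one OPERATIONn macro per fused operation, exactly as in the GetJitConstants code above. A condensed sketch of that string assembly; EltwiseOp, Mode, and BuildDoEltwise are reduced stand-ins for the real descriptors, covering only a subset of modes:

    #include <string>
    #include <vector>

    enum class Mode { ADD, SUB, MUL };   // reduced subset of EltwiseMode

    struct EltwiseOp { Mode mode; };     // stand-in for the real descriptor

    // Emit "const int16 tmpN = (int16)INPUT_N_0 <op> (int16)INPUT_N_1;" per op,
    // then chain the statements into one DO_ELTWISE macro body.
    std::string BuildDoEltwise(const std::vector<EltwiseOp>& ops) {
        std::string do_eltwise;
        for (size_t n = 0; n < ops.size(); n++) {
            const std::string ns = std::to_string(n);
            const char* sym = ops[n].mode == Mode::ADD ? " + "
                            : ops[n].mode == Mode::SUB ? " - " : " * ";
            do_eltwise += "\\\n\tconst int16 tmp" + ns + " = (int16)INPUT_" + ns +
                          "_0" + sym + "(int16)INPUT_" + ns + "_1;";
        }
        // the result of the last fused operation becomes the kernel's output
        do_eltwise += "\\\n\tres = tmp" + std::to_string(ops.size() - 1) + ";";
        return do_eltwise;
    }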
- -#pragma once - -#include "eltwise_kernel_base.h" - -namespace kernel_selector { -class EltwiseKernel_fs_bs_yx_bsv4_fsv32 : public EltwiseKernelBase { -public: - EltwiseKernel_fs_bs_yx_bsv4_fsv32() : EltwiseKernelBase("eltwise_fs_bs_yx_bsv4_fsv32") {} - virtual ~EltwiseKernel_fs_bs_yx_bsv4_fsv32() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const eltwise_params& params) const override; - DispatchData SetDefault(const eltwise_params& params) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp index aeda60ba4ae..f8021ff16cf 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp @@ -38,7 +38,6 @@ ParamsKey EltwiseKernelRef::GetSupportedKey() const { k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); - k.EnableInt8Quantization(); k.EnableEltwiseStride(); k.EnableEltwiseBroadcast(); return k; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp index 04d9ad9a9d0..697e6a847f1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp @@ -16,8 +16,6 @@ #include "eltwise_kernel_selector.h" #include "eltwise_kernel_ref.h" #include "eltwise_kernel_vload8.h" -#include "eltwise_kernel_fs_bs_yx_bsv4_fsv32.h" -#include "eltwise_kernel_b_fs_yx_fsv4.h" #include "eltwise_kernel_fs_b_yx_fsv32.h" #include "eltwise_kernel_b_fs_yx_fsv16.h" #include "eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.h" @@ -26,8 +24,6 @@ namespace kernel_selector { eltwise_kernel_selector::eltwise_kernel_selector() { Attach<EltwiseKernelRef>(); Attach<EltwiseKernel_vload8>(); - Attach<EltwiseKernel_fs_bs_yx_bsv4_fsv32>(); - Attach<EltwiseKernel_b_fs_yx_fsv4>(); Attach<EltwiseKernel_fs_b_yx_fsv32>(); Attach<EltwiseKernel_b_fs_yx_fsv16>(); Attach<EltwiseKernel_mixed_byxf_and_fs_b_yx_fsv32>(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp deleted file mode 100644 index 1b8e52e2dba..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
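One more note on the removed eltwise kernels before the embed files: the IN_*_PITCH constants emitted by the deleted fs_bs_yx_bsv4_fsv32 kernel follow directly from the blocked layout, where a block of 4 batches by 32 features is stored contiguously. A direct restatement of that pitch math; the struct and function names are illustrative, and x_padded, y_padded, and batch are assumed inputs:

    #include <cstddef>

    struct Bsv4Fsv32Pitches {            // illustrative holder for the four pitches
        size_t x, y, b_block, f_block;
    };

    // Mirrors the JIT-constant computation in the deleted kernel: one x-step
    // spans a whole 4x32 block, and higher pitches multiply up the padded dims.
    Bsv4Fsv32Pitches ComputePitches(size_t x_padded, size_t y_padded, size_t batch) {
        Bsv4Fsv32Pitches p;
        p.x = 32 * 4;                                // IN_X_PITCH
        p.y = p.x * x_padded;                        // IN_Y_PITCH
        p.b_block = p.y * y_padded;                  // IN_B_BLOCK_PITCH
        p.f_block = p.b_block * ((batch + 3) / 4);   // IN_F_BLOCK_PITCH
        return p;
    }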
-*/ - -#include "embed_kernel_ref.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" -#include - -namespace kernel_selector { - -ParamsKey EmbedKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::F16); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableAllInputLayout(); - k.EnableOutputLayout(DataLayout::bf); - k.EnableBiasPerOutput(); - k.EnableBiasPerFeature(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnableNonBiasTerm(); - return k; -} - -JitConstants EmbedKernelRef::GetJitConstants(const embed_params& params) const { - JitConstants jit = WeightBiasKernelBase::GetJitConstants(params); - const auto& input = params.inputs[0]; - const auto x_size = input.LogicalSize() / input.Batch().v; - const auto w_size = params.weights.OFM().v; - jit.AddConstant(MakeJitConstant("INPUT0_ELEMENTS_COUNT", x_size)); - jit.AddConstant(MakeJitConstant("NUM_OUTPUT_SIZE", w_size)); - - return jit; -} - -EmbedKernelRef::DispatchData EmbedKernelRef::SetDefault(const embed_params& params) const { - DispatchData kd; - std::vector<size_t> global = {params.inputs[0].X().v, params.weights.OFM().v, params.inputs[0].Batch().v}; - std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); - - kd.gws0 = global[0]; - kd.gws1 = global[1]; - kd.gws2 = global[2]; - - kd.lws0 = local[0]; - kd.lws1 = local[1]; - kd.lws2 = 1; - return kd; -} - -KernelsData EmbedKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - assert(params.GetType() == KernelType::EMBED); - - const embed_params& orgParams = static_cast<const embed_params&>(params); - - DispatchData runInfo = SetDefault(orgParams); - KernelData kd = KernelData::Default<embed_params>(params); - embed_params& newParams = *static_cast<embed_params*>(kd.params.get()); - - bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oiyx, kd.weightsReorderParams); - - if (!succeed) { - return {}; - } - - auto cldnn_jit = GetJitConstants(newParams); - auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - - FillCLKernelData(kernel, - runInfo, - params.engineInfo, - kernelName, - jit, - entry_point, - DEFAULT, - true, - !newParams.bias.empty()); - - kd.estimatedTime = runInfo.efficiency; - - return {kd}; -} - -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h deleted file mode 100644 index ccedf630559..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h +++ /dev/null @@ -1,42 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "weight_bias_kernel_base.h" -#include "embed_params.h" -#include "common_kernel_base.h" - -namespace kernel_selector { - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// EmbedKernelRef -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class EmbedKernelRef : public WeightBiasKernelBase { -public: - EmbedKernelRef() : WeightBiasKernelBase("embed_ref") {} - virtual ~EmbedKernelRef() {} - - struct DispatchData : public CommonDispatchData {}; - - ParamsKey GetSupportedKey() const override; - -protected: - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual JitConstants GetJitConstants(const embed_params& params) const; - virtual DispatchData SetDefault(const embed_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp deleted file mode 100644 index d9de5af8951..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "embed_kernel_selector.h" -#include "embed_kernel_ref.h" - -namespace kernel_selector { - -embed_kernel_selector::embed_kernel_selector() { Attach<EmbedKernelRef>(); } - -KernelsData embed_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::EMBED); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h deleted file mode 100644 index 1e2db97263c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class embed_kernel_selector : public kernel_selector_base {
-public:
-    static embed_kernel_selector& Instance() {
-        static embed_kernel_selector instance_;
-        return instance_;
-    }
-
-    embed_kernel_selector();
-
-    virtual ~embed_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h
deleted file mode 100644
index 94826c29858..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "weight_bias_params.h"
-#include <sstream>
-
-namespace kernel_selector {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// embed_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct embed_params : public weight_bias_params {
-    embed_params() : weight_bias_params(KernelType::EMBED) {}
-
-    std::string to_string() const {
-        std::stringstream s;
-
-        s << base_params::to_string() << "_";
-        if (bias.empty()) {
-            s << "no_bias"
-              << "_";
-        } else {
-            s << "bias_" << bias[0].PhysicalSize() << "_";
-        }
-        return s.str();
-    }
-    virtual ParamsKey GetParamsKey() const { return weight_bias_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// embed_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct embed_optional_params : weight_bias_optional_params {
-    embed_optional_params() : weight_bias_optional_params(KernelType::EMBED) {}
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp
index 56d1ad1f1a3..9617e458cca 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp
@@ -29,8 +29,6 @@ JitConstants FullyConnectedKernelBase::GetJitConstants(const fully_connected_par
 
     jit.AddConstant(MakeJitConstant("INPUT0_ELEMENTS_COUNT", x_size));
 
-    jit.AddConstant(MakeJitConstant("QUANTIZATION_TERM", params.quantization != QuantizationType::NONE));
-
     return jit;
 }
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp
deleted file mode 100644
index 035fc421d93..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_kernel_mmad_batched.h"
-
-namespace kernel_selector {
-ParamsKey FullyConnected_mmad_batched::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::bf);
-    k.EnableBiasPerOutput();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    k.EnableInt8Quantization();
-    k.EnableOutputCalibration();
-    return k;
-}
-
-bool FullyConnected_mmad_batched::Validate(const Params& p, const optional_params& o) const {
-    if (!FullyConnectedKernelBase::Validate(p, o)) {
-        return false;
-    }
-
-    const auto& params = static_cast<const fully_connected_params&>(p);
-
-    // we do not support padded input
-    if (params.inputs[0].X().pad.Total() != 0 || params.inputs[0].Y().pad.Total() != 0)
-        return false;
-
-    size_t batch = params.inputs[0].Batch().v;
-    // batch must be a multiple of 8
-    if (batch % 8 != 0) {
-        return false;
-    }
-
-    return true;
-}
-
-JitConstants FullyConnected_mmad_batched::GetJitConstants(const fully_connected_params& params,
-                                                          const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
-    return jit;
-}
-
-FullyConnected_mmad_batched::DispatchData FullyConnected_mmad_batched::SetDefault(const fully_connected_params& params,
-                                                                                  int) const {
-    auto runInfo = Parent::SetDefault(params);
-
-    constexpr size_t sub_group_size = 8;
-
-    const auto of_maps = params.output.Feature().v;
-    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
-    runInfo.gws0 = params.output.Batch().v / 8;  // we process 8 batches in a single WG
-    runInfo.gws1 = of_threads_per_batch;
-    runInfo.gws2 = 1;
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = sub_group_size;
-    runInfo.lws2 = 1;
-
-    runInfo.efficiency = FORCE_PRIORITY_1;
-    return runInfo;
-}
-
-KernelsData FullyConnected_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelsData res = {};
-    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
-        KernelsData kd = GetTunedKernelsDataByIndex(params,
-                                                    options,
-                                                    DataLayout::fs_bs_yx_bsv4_fsv32,
-                                                    WeightsLayout::os_is_yx_isa8_osv8_isv4,
-                                                    FORCE_PRIORITY_1,
-                                                    static_cast<int>(i));
-        if (!kd.empty()) {
-            res.emplace_back(kd[0]);
-        }
-    }
-    return res;
-}
-} // namespace kernel_selector
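
Note on the removed kernel above: FILTER_OFM_BLOCK_PITCH and the IN_*_PITCH constants encode the fs_bs_yx_bsv4_fsv32 packing, with features tiled by 32 and batches by 4. A minimal standalone sketch of that pitch arithmetic, using hypothetical sizes (64 IFM, 3x3 filter, 56x56 input, batch 8); the numbers are illustrative only and are not part of the removed code:

#include <cassert>
#include <cstddef>

// Round x up to a multiple of m (the role Align() plays above).
static size_t align_up(size_t x, size_t m) { return ((x + m - 1) / m) * m; }

int main() {
    const size_t ifm = 64, filter_x = 3, filter_y = 3;            // hypothetical filter
    const size_t ifm_32_aligned = align_up(ifm, 32);              // 64
    // Each OFM block stores 4*8*8 = 256 weights per 32-IFM slice.
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * filter_x * filter_y * 4 * 8 * 8;  // 2 * 9 * 256 = 4608
    assert(filter_ofm_block_pitch == 4608);

    const size_t in_x = 56, in_y = 56, batch = 8;                 // hypothetical input
    const size_t in_x_pitch = 32 * 4;                             // 32 features * 4 batches
    const size_t in_y_pitch = in_x_pitch * in_x;                  // 128 * 56 = 7168
    const size_t in_b_block_pitch = in_y_pitch * in_y;            // one 4-batch block
    const size_t in_f_block_pitch = in_b_block_pitch * ((batch + 3) / 4);  // 2 blocks
    assert(in_f_block_pitch == in_b_block_pitch * 2);
    return 0;
}
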
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h
deleted file mode 100644
index 07feee159a0..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fully_connected_kernel_base.h"
-
-namespace kernel_selector {
-
-class FullyConnected_mmad_batched : public FullyConnectedKernelBase {
-public:
-    using Parent = FullyConnectedKernelBase;
-
-    FullyConnected_mmad_batched() : Parent("fully_connected_gpu_mmad_batched") {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    bool Validate(const Params& p, const optional_params& o) const override;
-    JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
index 3135d00e592..fc7d28aedcf 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
@@ -28,7 +28,6 @@
 #include "fully_connected_kernel_fb_io_block.h"
 #include "fully_connected_kernel_bf_io_input_spatial.h"
 #include "fully_connected_kernel_mmad.h"
-#include "fully_connected_kernel_mmad_batched.h"
 #include "fully_connected_kernel_imad.h"
 #include "fully_connected_kernel_fs_byx_fsv32.h"
 
@@ -49,7 +48,6 @@ fully_connected_kernel_selector::fully_connected_kernel_selector() {
     Attach();
     Attach();
     Attach();
-    // Attach<FullyConnected_mmad_batched>();
     Attach();
     Attach();
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp
deleted file mode 100644
index a5eb45ed347..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_input_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include <algorithm>
-#include <vector>
-
-namespace kernel_selector {
-JitConstants FullyConnectedGradInputKernelBase::GetJitConstants(const fully_connected_grad_input_params& params) const {
-    return WeightBiasKernelBase::GetJitConstants(params);
-}
-
-FullyConnectedGradInputKernelBase::DispatchData FullyConnectedGradInputKernelBase::SetDefault(
-    const fully_connected_grad_input_params& params) const {
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-    size_t gws0 = params.output.Batch().v * params.weights.IFM().v;
-    size_t lws0 = std::min(gws0, static_cast<size_t>(32));
-    while (gws0 % lws0) {
-        lws0--;
-    }
-    kd.gws0 = gws0;
-    kd.gws1 = params.weights.X().v;
-    kd.gws2 = params.weights.Y().v;
-    kd.lws0 = lws0;
-    kd.lws1 = 1;
-    kd.lws2 = 1;
-    kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-    return kd;
-}
-
-KernelsData FullyConnectedGradInputKernelBase::GetKernelsData(const Params& params,
-                                                              const optional_params& options) const {
-    assert(params.GetType() == KernelType::FULLY_CONNECTED_GRAD_INPUT);
-
-    const fully_connected_grad_input_params& orgParams = static_cast<const fully_connected_grad_input_params&>(params);
-
-    DispatchData runInfo = SetDefault(orgParams);
-    KernelData kd = KernelData::Default(params);
-    fully_connected_grad_input_params& newParams = *static_cast<fully_connected_grad_input_params*>(kd.params.get());
-
-    bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oi, kd.weightsReorderParams);
-
-    if (!succeed) {
-        return {};
-    }
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel,
-                     runInfo,
-                     params.engineInfo,
-                     kernelName,
-                     jit,
-                     entry_point,
-                     DEFAULT,
-                     true,
-                     !orgParams.bias.empty());
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
-
-    kd.estimatedTime = runInfo.efficiency;
-
-    return {kd};
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h
deleted file mode 100644
index 29ada244e3a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "weight_bias_kernel_base.h"
-#include "kernel_selector_params.h"
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_input_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_input_params : public weight_bias_params {
-    fully_connected_grad_input_params() : weight_bias_params(KernelType::FULLY_CONNECTED_GRAD_INPUT) {}
-
-    virtual ParamsKey GetParamsKey() const { return weight_bias_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_input_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_input_optional_params : weight_bias_optional_params {
-    fully_connected_grad_input_optional_params()
-        : weight_bias_optional_params(KernelType::FULLY_CONNECTED_GRAD_INPUT) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// FullyConnectedGradInputKernelBase
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class FullyConnectedGradInputKernelBase : public WeightBiasKernelBase {
-public:
-    using WeightBiasKernelBase::WeightBiasKernelBase;
-    virtual ~FullyConnectedGradInputKernelBase() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const;
-    virtual JitConstants GetJitConstants(const fully_connected_grad_input_params& params) const;
-    virtual DispatchData SetDefault(const fully_connected_grad_input_params& params) const;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp
deleted file mode 100644
index 4eeab782044..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_input_kernel_ref.h"
-
-namespace kernel_selector {
-
-ParamsKey FullyConnectedGradInputKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputWeightsType(WeightsType::F16);
-    k.EnableInputWeightsType(WeightsType::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::yxfb);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::byxf);
-    k.EnableOutputLayout(DataLayout::yxfb);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::byxf);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    // TODO: add support to batching, figure out the way to update weights/biases for multiple batches at the same time
-    k.EnableBatching();
-    k.EnableGradient();
-    k.DisableTuning();
-    return k;
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h
deleted file mode 100644
index 4ccab494f70..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fully_connected_grad_input_kernel_base.h"
-
-namespace kernel_selector {
-
-class FullyConnectedGradInputKernelRef : public FullyConnectedGradInputKernelBase {
-public:
-    FullyConnectedGradInputKernelRef() : FullyConnectedGradInputKernelBase("fully_connected_grad_input_gpu_ref") {}
-    virtual ~FullyConnectedGradInputKernelRef() {}
-
-    ParamsKey GetSupportedKey() const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp
deleted file mode 100644
index a7df113d338..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_input_kernel_selector.h"
-#include "fully_connected_grad_input_kernel_ref.h"
-
-namespace kernel_selector {
-fully_connected_grad_input_kernel_selector::fully_connected_grad_input_kernel_selector() {
-    Attach<FullyConnectedGradInputKernelRef>();
-}
-
-KernelsData fully_connected_grad_input_kernel_selector::GetBestKernels(const Params& params,
-                                                                       const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::FULLY_CONNECTED_GRAD_INPUT);
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h
deleted file mode 100644
index b2d165d1b4e..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class fully_connected_grad_input_kernel_selector : public kernel_selector_base {
-public:
-    static fully_connected_grad_input_kernel_selector& Instance() {
-        static fully_connected_grad_input_kernel_selector instance_;
-        return instance_;
-    }
-
-    fully_connected_grad_input_kernel_selector();
-
-    virtual ~fully_connected_grad_input_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp
deleted file mode 100644
index a5e4cdae69a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_weights_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include <algorithm>
-#include <vector>
-
-namespace kernel_selector {
-JitConstants FullyConnectedGradWeightsKernelBase::GetJitConstants(
-    const fully_connected_grad_weights_params& params) const {
-    JitConstants jit = training_kernel_base::GetJitConstants(params);
-
-    return jit;
-}
-
-FullyConnectedGradWeightsKernelBase::DispatchData FullyConnectedGradWeightsKernelBase::SetDefault(
-    const fully_connected_grad_weights_params& params) const {
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-    size_t gws0 = params.weights.OFM().v * params.weights.IFM().v;
-    size_t lws0 = std::min(gws0, static_cast<size_t>(32));
-    while (gws0 % lws0) {
-        lws0--;
-    }
-    kd.gws0 = gws0;
-    kd.gws1 = params.weights.X().v;
-    kd.gws2 = params.weights.Y().v;
-    kd.lws0 = lws0;
-    kd.lws1 = 1;
-    kd.lws2 = 1;
-    kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-    return kd;
-}
-
-KernelsData FullyConnectedGradWeightsKernelBase::GetKernelsData(const Params& params,
-                                                                const optional_params& options) const {
-    assert(params.GetType() == KernelType::FULLY_CONNECTED_GRAD_WEIGHTS);
-
-    const fully_connected_grad_weights_params& orgParams =
-        static_cast<const fully_connected_grad_weights_params&>(params);
-
-    DispatchData runInfo = SetDefault(orgParams);
-    KernelData kd = KernelData::Default(params);
-    fully_connected_grad_weights_params& newParams =
-        *static_cast<fully_connected_grad_weights_params*>(kd.params.get());
-
-    bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oi, kd.weightsReorderParams);
-
-    if (!succeed) {
-        return {};
-    }
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel,
-                     runInfo,
-                     params.engineInfo,
-                     kernelName,
-                     jit,
-                     entry_point,
-                     DEFAULT,
-                     true,
-                     !orgParams.bias.empty());
-    if (orgParams.use_momentum) {
-        kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0});
-        if (!orgParams.bias.empty())
-            kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_BIAS_GRADIENT, 0});
-    }
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
-    kernel.arguments.push_back({ArgumentDescriptor::Types::LEARNING_RATE, 0});
-
-    kd.estimatedTime = runInfo.efficiency;
-
-    return {kd};
-}
-} // namespace kernel_selector
\ No newline at end of file
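
Both removed training kernels (grad_input earlier and grad_weights above) derive the local work size the same way: start from min(gws0, 32) and decrement until the value divides gws0, which yields the largest divisor of gws0 that does not exceed 32. A self-contained sketch of that rule; the sample sizes are hypothetical:

#include <algorithm>
#include <cassert>
#include <cstddef>

// Largest local size <= 32 that evenly divides the global size, mirroring
// the lws0 loop in both SetDefault() implementations above.
static size_t pick_lws0(size_t gws0) {
    size_t lws0 = std::min(gws0, static_cast<size_t>(32));
    while (gws0 % lws0) {
        lws0--;
    }
    return lws0;
}

int main() {
    assert(pick_lws0(96) == 32);  // 96 = 3 * 32, so 32 itself divides
    assert(pick_lws0(50) == 25);  // divisors of 50 up to 32: 25 is the largest
    assert(pick_lws0(7) == 7);    // gws0 below 32 starts (and stays) at gws0
    return 0;
}
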
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h
deleted file mode 100644
index 38115d73fcc..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "training_kernel_base.h"
-#include "kernel_selector_params.h"
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_weights_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_weights_params : public training_params {
-    fully_connected_grad_weights_params() : training_params(KernelType::FULLY_CONNECTED_GRAD_WEIGHTS) {}
-
-    virtual ParamsKey GetParamsKey() const {
-        ParamsKey k = training_params::GetParamsKey();
-
-        return k;
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_weights_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_weights_optional_params : training_optional_params {
-    fully_connected_grad_weights_optional_params()
-        : training_optional_params(KernelType::FULLY_CONNECTED_GRAD_WEIGHTS) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// FullyConnectedGradWeightsKernelBase
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class FullyConnectedGradWeightsKernelBase : public training_kernel_base {
-public:
-    using training_kernel_base::training_kernel_base;
-    virtual ~FullyConnectedGradWeightsKernelBase() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const;
-    virtual JitConstants GetJitConstants(const fully_connected_grad_weights_params& params) const;
-    virtual DispatchData SetDefault(const fully_connected_grad_weights_params& params) const;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp
deleted file mode 100644
index ef14d53e6a3..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_weights_kernel_ref.h"
-
-namespace kernel_selector {
-
-ParamsKey FullyConnectedGradWeightsKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputWeightsType(WeightsType::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::yxfb);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::byxf);
-    k.EnableOutputLayout(DataLayout::yxfb);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::byxf);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    k.EnableMomentum();
-    k.EnableBatching();
-    k.EnableGradient();
-    k.DisableTuning();
-    return k;
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h
deleted file mode 100644
index 196d07d6579..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fully_connected_grad_weights_kernel_base.h"
-
-namespace kernel_selector {
-
-class FullyConnectedGradWeightsKernelRef : public FullyConnectedGradWeightsKernelBase {
-public:
-    FullyConnectedGradWeightsKernelRef()
-        : FullyConnectedGradWeightsKernelBase("fully_connected_grad_weights_gpu_ref") {}
-    virtual ~FullyConnectedGradWeightsKernelRef() {}
-
-    ParamsKey GetSupportedKey() const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp
deleted file mode 100644
index 0887084a7bb..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_weights_kernel_selector.h"
-#include "fully_connected_grad_weights_kernel_ref.h"
-
-namespace kernel_selector {
-fully_connected_grad_weights_kernel_selector::fully_connected_grad_weights_kernel_selector() {
-    Attach<FullyConnectedGradWeightsKernelRef>();
-}
-
-KernelsData fully_connected_grad_weights_kernel_selector::GetBestKernels(const Params& params,
-                                                                         const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::FULLY_CONNECTED_GRAD_WEIGHTS);
-}
-} // namespace kernel_selector
\ No newline at end of file
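
Every *_kernel_selector removed in this patch has the same shape: a Meyers singleton whose constructor registers implementations through Attach<T>(), with GetBestKernels() dispatching on the KernelType. A skeletal illustration with simplified stand-in types (this is not the real kernel_selector API):

#include <cstdio>
#include <memory>
#include <vector>

struct toy_kernel {
    virtual ~toy_kernel() = default;
    virtual const char* name() const = 0;
};

struct toy_ref_kernel : toy_kernel {
    const char* name() const override { return "toy_ref"; }
};

class toy_selector {
public:
    static toy_selector& Instance() {
        static toy_selector instance_;  // same Meyers-singleton shape as above
        return instance_;
    }
    const toy_kernel* best() const {
        return kernels_.empty() ? nullptr : kernels_.front().get();
    }

private:
    toy_selector() { Attach<toy_ref_kernel>(); }  // constructor registers kernels
    template <typename T>
    void Attach() { kernels_.emplace_back(new T()); }
    std::vector<std::unique_ptr<toy_kernel>> kernels_;
};

int main() {
    std::printf("%s\n", toy_selector::Instance().best()->name());  // prints "toy_ref"
    return 0;
}
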
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h
deleted file mode 100644
index 680b2229313..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class fully_connected_grad_weights_kernel_selector : public kernel_selector_base {
-public:
-    static fully_connected_grad_weights_kernel_selector& Instance() {
-        static fully_connected_grad_weights_kernel_selector instance_;
-        return instance_;
-    }
-
-    fully_connected_grad_weights_kernel_selector();
-
-    virtual ~fully_connected_grad_weights_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp
deleted file mode 100644
index aea2eb96edb..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "fused_conv_bn_scale_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include "common_tools.h"
-#include <algorithm>
-#include <vector>
-
-namespace kernel_selector {
-bool fused_conv_bn_scale_kernel_base::Validate(const Params& p, const optional_params& o) const {
-    if (p.GetType() != KernelType::FUSED_CONV_BN_SCALE || o.GetType() != KernelType::FUSED_CONV_BN_SCALE) {
-        return false;
-    }
-
-    const fused_conv_bn_scale_params& params = static_cast<const fused_conv_bn_scale_params&>(p);
-    const fused_conv_bn_scale_optional_params& optParams = static_cast<const fused_conv_bn_scale_optional_params&>(o);
-
-    bool bSupportedWeightsLayout = params.weights.GetLayout() == GetPreferredWeightsLayout(params);
-
-    const bool bWeightsOK = bSupportedWeightsLayout || optParams.allowStaticInputReordering;
-
-    return bWeightsOK;
-}
-
-JitConstants fused_conv_bn_scale_kernel_base::GetJitConstants(const fused_conv_bn_scale_params& params,
-                                                              const DispatchData&) const {
-    JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params);
-    const auto& padding = params.padding;
-    const auto& input = params.inputs[0];
-
-    int64_t input_offset_with_padding =
-        (int64_t)input.GetFirstElementOffset() - padding.x * input.X().pitch - input.Y().pitch * padding.y;
-    input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0);
-
-    mem_consts.AddConstants({MakeJitConstant("STRIDE", params.stride),
-                             MakeJitConstant("PADDING", params.padding),
-                             MakeJitConstant("FILTER_ARRAY_NUM", params.split),
-                             MakeJitConstant("DILATION", params.dilation),
-                             MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding),
-                             MakeJitConstant("EPSILON", params.epsilon)});
-
-    if (params.fused_in_training)
-        mem_consts.AddConstant(MakeJitConstant("FUSED_TRAINING", 1));
-    if (params.scale_bias)
-        mem_consts.AddConstant(MakeJitConstant("SCALE_BIAS_TERM", 1));
-
-    return mem_consts;
-}
-
-bool fused_conv_bn_scale_kernel_base::CheckWorkGroups(const DispatchData& kd) {
-    if (kd.gws0 == 0 || kd.gws1 == 0 || kd.gws2 == 0 || kd.lws0 == 0 || kd.lws1 == 0 || kd.lws2 == 0) {
-        return false;
-    }
-
-    if ((kd.gws0 % kd.lws0) != 0 || (kd.gws1 % kd.lws1) != 0 || (kd.gws2 % kd.lws2) != 0) {
-        return false;
-    }
-
-    return true;
-}
-
-fused_conv_bn_scale_kernel_base::DispatchData fused_conv_bn_scale_kernel_base::SetDefault(
-    const fused_conv_bn_scale_params& params) const {
-    DispatchData kd;
-
-    const auto& out = params.output;
-    kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
-    std::vector<size_t> global;
-    if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
-        global = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
-    } else {
-        global = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
-    }
-
-    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
-    kd.gws0 = global[0];
-    kd.gws1 = global[1];
-    kd.gws2 = global[2];
-
-    kd.lws0 = local[0];
-    kd.lws1 = local[1];
-    kd.lws2 = local[2];
-
-    kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-    return kd;
-}
-
-KernelsData fused_conv_bn_scale_kernel_base::GetCommonKernelsData(const Params& params,
-                                                                  const optional_params& options,
-                                                                  float estimated_time) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    KernelData kd = KernelData::Default(params);
-    fused_conv_bn_scale_params& newParams = *static_cast<fused_conv_bn_scale_params*>(kd.params.get());
-
-    DispatchData runInfo = SetDefault(newParams);
-
-    if (!CheckWorkGroups(runInfo)) {
-        // Internal Error - wrong calculation of global/local work group sizes
-        return {};
-    }
-
-    bool succeed =
-        UpdateWeightsParams(newParams, options, GetPreferredWeightsLayout(newParams), kd.weightsReorderParams);
-
-    if (!succeed) {
-        return {};
-    }
-
-    auto finalKernelName = GetKernelName(newParams);
-    auto cldnnJit = GetJitConstants(newParams, runInfo);
-    auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
-    auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel,
-                     runInfo,
-                     params.engineInfo,
-                     finalKernelName,
-                     jit,
-                     entryPoint,
-                     "",
-                     true,
-                     !newParams.bias.empty(),
-                     1);
-    kernel.arguments.push_back({ArgumentDescriptor::Types::SPLIT, 0});
-    uint32_t idx = 1;
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-    if (newParams.scale_bias)
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-    if (newParams.fused_in_training) {
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx});
-    }
-
-    kd.estimatedTime = estimated_time;
-
-    return {kd};
-}
-} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h
deleted file mode 100644
index 6abddd9f09f..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "weight_bias_kernel_base.h"
-#include "actual_kernels/convolution/convolution_params.h"
-#include <string>
-#include <vector>
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fused_conv_bn_scale_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fused_conv_bn_scale_params : public weight_bias_params {
-    fused_conv_bn_scale_params() : weight_bias_params(KernelType::FUSED_CONV_BN_SCALE) {}
-
-    uSize filterSize;
-    uSize stride;
-    uSize dilation;
-    uSize padding;
-    uint32_t split = 1;
-    bool fused_in_training = false;
-    bool scale_bias = false;
-    float epsilon = 0.00001f;
-
-    ParamsKey GetParamsKey() const override {
-        ParamsKey k = weight_bias_params::GetParamsKey();
-
-        if (split > 1) {
-            k.EnableSplitSupport();
-        }
-
-        return k;
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fused_conv_bn_scale_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fused_conv_bn_scale_optional_params : weight_bias_optional_params {
-    fused_conv_bn_scale_optional_params() : weight_bias_optional_params(KernelType::FUSED_CONV_BN_SCALE) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fused_conv_bn_scale_kernel_base
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class fused_conv_bn_scale_kernel_base : public WeightBiasKernelBase {
-public:
-    using WeightBiasKernelBase::WeightBiasKernelBase;
-    virtual ~fused_conv_bn_scale_kernel_base() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    virtual WeightsLayout GetPreferredWeightsLayout(const fused_conv_bn_scale_params &) const = 0;
-    virtual std::string GetKernelName(const fused_conv_bn_scale_params&) const { return kernelName; }
-    bool Validate(const Params& p, const optional_params& o) const override;
-    virtual JitConstants GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData& kd) const;
-    virtual DispatchData SetDefault(const fused_conv_bn_scale_params& params) const;
-    static bool CheckWorkGroups(const DispatchData&);
-    KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, float estimated_time) const;
-};
-} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp
deleted file mode 100644
index ebd7e52624c..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fused_conv_bn_scale_kernel_ref.h"
-#include "kernel_selector_utils.h"
-#include <algorithm>
-
-namespace kernel_selector {
-
-ParamsKey fused_conv_bn_scale_kernel_ref::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputWeightsType(WeightsType::F32);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    k.EnableSplitSupport();
-    k.EnableBatching();
-    k.DisableTuning();
-    return k;
-}
-
-fused_conv_bn_scale_kernel_base::DispatchData fused_conv_bn_scale_kernel_ref::SetDefault(
-    const fused_conv_bn_scale_params& arg) const {
-    DispatchData runInfo = fused_conv_bn_scale_kernel_base::SetDefault(arg);
-
-    runInfo.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
-    runInfo.gws0 = arg.output.Batch().v;
-    runInfo.gws1 = arg.output.Feature().v;
-    runInfo.gws2 = 1;
-
-    runInfo.lws0 = std::min(std::max(runInfo.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
-    while (runInfo.gws0 % runInfo.lws0 != 0) {
-        --runInfo.lws0;
-    }
-    runInfo.lws1 = 1;
-    runInfo.lws2 = 1;
-
-    return runInfo;
-}
-
-JitConstants fused_conv_bn_scale_kernel_ref::GetJitConstants(const fused_conv_bn_scale_params& params,
-                                                             const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    return jit;
-}
-
-KernelsData fused_conv_bn_scale_kernel_ref::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelsData kd = GetCommonKernelsData(params, options, DONT_USE_IF_HAVE_SOMETHING_ELSE);
-
-    return kd;
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h
deleted file mode 100644
index 9e8222fc71b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fused_conv_bn_scale_kernel_base.h"
-#include <string>
-
-namespace kernel_selector {
-
-class fused_conv_bn_scale_kernel_ref : public fused_conv_bn_scale_kernel_base {
-public:
-    using Parent = fused_conv_bn_scale_kernel_base;
-
-    fused_conv_bn_scale_kernel_ref() : fused_conv_bn_scale_kernel_base("fused_conv_bn_scale_kernel_ref") {}
-    virtual ~fused_conv_bn_scale_kernel_ref() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    WeightsLayout GetPreferredWeightsLayout(const fused_conv_bn_scale_params &) const override {
-        return WeightsLayout::oiyx;
-    }
-    DispatchData SetDefault(const fused_conv_bn_scale_params& arg) const override;
-    JitConstants GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData& kd) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp
deleted file mode 100644
index 04674987c7e..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fused_conv_bn_scale_kernel_selector.h"
-#include "fused_conv_bn_scale_kernel_ref.h"
-
-namespace kernel_selector {
-fused_conv_bn_scale_kernel_selector::fused_conv_bn_scale_kernel_selector() { Attach<fused_conv_bn_scale_kernel_ref>(); }
-
-KernelsData fused_conv_bn_scale_kernel_selector::GetBestKernels(const Params& params,
-                                                                const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::FUSED_CONV_BN_SCALE);
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h
deleted file mode 100644
index db78aaa79fe..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class fused_conv_bn_scale_kernel_selector : public kernel_selector_base {
-public:
-    static fused_conv_bn_scale_kernel_selector& Instance() {
-        static fused_conv_bn_scale_kernel_selector instance_;
-        return instance_;
-    }
-
-    fused_conv_bn_scale_kernel_selector();
-
-    virtual ~fused_conv_bn_scale_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp
deleted file mode 100644
index 1ecf94d0916..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
-// Copyright (c) 2019-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "fused_conv_eltwise_kernel_af32_imad_1x1.h"
-
-static size_t GetTileLength(size_t out_xy, size_t out_f, size_t min_threads) {
-    for (int tile_len = 14; tile_len > 0; tile_len--) {
-        // Kernel writes 32 output features per HW thread
-        size_t threads = (out_xy / tile_len) * out_xy * out_f / 32;
-        // Choose largest valid tile with enough HW threads
-        if ((out_xy % tile_len == 0) && (threads >= min_threads)) {
-            return tile_len;
-        }
-    }
-    return out_xy % 8 ? (out_xy % 7 ? 1 : 7) : 8;
-}
1 : 7) : 8; -} - -namespace kernel_selector { - -ParamsKey fused_conv_eltwise_kernel_af32_imad_1x1::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableDifferentTypes(); - k.EnableDifferentInputWeightsTypes(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableDepthwiseSeparableOpt(); - k.EnableInt8Quantization(); - k.EnableOutputCalibration(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - k.EnableEltwiseStride(); - return k; -} - -bool fused_conv_eltwise_kernel_af32_imad_1x1::Validate(const Params& p, const optional_params& o) const { - if (!Parent::Validate(p, o)) { - return false; - } - - KernelData kd = KernelData::Default(p); - fused_conv_eltwise_params& newParams = *static_cast(kd.params.get()); - - if (newParams.conv.filterSize.x != 1 || newParams.conv.filterSize.y != 1) - return false; - - if (newParams.conv.padding.x != 0 || newParams.conv.padding.y != 0) - return false; - - if (newParams.output.Feature().v % 32 != 0) - return false; - - const auto& input = newParams.inputs[0]; - - // we do not support padded input - if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0) - return false; - - if (newParams.conv.split != 1) - return false; - - return true; -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_af32_imad_1x1::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = Parent::SetDefault(arg); - - // Sub-group size - constexpr size_t sub_group_size = 8; - - const auto of_maps = arg.output.Feature().v; - const size_t of_maps_per_batch = RoundUp(of_maps, 32); - const size_t of_maps_total = of_maps_per_batch * arg.output.Batch().v; - - // Need to have at least 4 HW threads per EU - const size_t tile_length = GetTileLength(arg.output.X().v, of_maps_total, arg.engineInfo.computeUnitsCount * 4); - runInfo.cldnnStyle.blockWidth = tile_length; - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = arg.output.X().v * arg.output.Y().v / tile_length; - runInfo.gws1 = of_maps_total / 4; // TILE_DEPTH==4 - runInfo.gws2 = 1; - - runInfo.lws0 = 1; - runInfo.lws1 = sub_group_size; - runInfo.lws2 = 1; - - return runInfo; -} - -JitConstants fused_conv_eltwise_kernel_af32_imad_1x1::GetJitConstants(const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1)); - - jit.AddConstant(MakeJitConstant("TILE_LENGTH", runInfo.cldnnStyle.blockWidth)); - jit.AddConstant(MakeJitConstant("TILE_DEPTH", 4)); - - if (params.non_conv_scale != 1.0f) - jit.AddConstant(MakeJitConstant("NON_CONV_SCALE", params.non_conv_scale)); - - jit.Merge(MakeActivationJitConstants(params.conv.activations, GetUnitType(params), "_CONV_TYPED", true)); - jit.Merge(MakeActivationJitConstants(params.activations, GetUnitType(params), "_ELTW_TYPED", true)); - jit.Merge(MakeTypeJitConstants(Datatype::F32, "float")); - - 
return jit; -} - -KernelsData fused_conv_eltwise_kernel_af32_imad_1x1::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h deleted file mode 100644 index aa0f954e884..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_af32_imad_1x1 : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_af32_imad_1x1() - : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_af32_imad_1x1") {} - virtual ~fused_conv_eltwise_kernel_af32_imad_1x1() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::os_is_osv32_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp index ac48606f673..515e2b26264 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp @@ -74,14 +74,6 @@ ParamsKey fused_conv_eltwise_params::GetParamsKey() const { k.EnableFusedConvEltwTranspose(); } - if (conv.int8_quantization) { - k.EnableFusedConvEltwInt8Quantization(); - } - - if (conv.output_calibration) { - k.EnableFusedConvEltwOutputCalibration(); - } - if (conv.local_convolution) { k.EnableFusedConvEltwLocalConvolution(); } @@ -133,22 +125,8 @@ JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_el MakeJitConstant("FILTER_ARRAY_NUM", params.conv.split), 
MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", params.conv.depthwise_separable_opt), - MakeJitConstant("QUANTIZATION_TERM", params.conv.int8_quantization), }); - if (params.conv.int8_quantization) { - mem_consts.AddConstants({MakeJitConstant("W_QF", params.conv.weights_quantization_factors[0])}); - mem_consts.AddConstants({MakeJitConstant("I_QF", params.conv.input_quantization_factor)}); - - if (params.conv.output_calibration) { - mem_consts.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.conv.output_calibration)); - mem_consts.AddConstant(MakeJitConstant("O_QF", params.conv.output_calibration_factors[0])); - - } else { - mem_consts.AddConstants({MakeJitConstant("O_QF", params.conv.output_quantization_factor)}); - } - } - if (params.conv.local_convolution) { mem_consts.AddConstants({MakeJitConstant("LOCAL_CONVOLUTION", params.conv.local_convolution)}); } @@ -157,7 +135,6 @@ JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_el mem_consts.Merge(eltw_activations); JitConstants conv_activations = MakeActivationJitConstants(params.conv.activations, GetUnitType(params), "_CONV"); mem_consts.Merge(conv_activations); - mem_consts.AddConstant(MakeJitConstant("ELTW_CALIBRATION_TERM", params.eltw.output_calibration)); if (!params.eltw.stride.empty()) { mem_consts.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); @@ -332,8 +309,6 @@ KernelsData fused_conv_eltwise_kernel_base::GetCommonKernelsData(const Params& p } else { kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); } - if (!newParams.eltw.output_calibration_factors.empty()) - kernel.arguments.push_back({ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 1}); kd.estimatedTime = runInfo.efficiency; kd.autoTuneIndex = autoTuneIndex; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h index 43d3c814277..4d1d1aa9856 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h @@ -37,13 +37,7 @@ struct fused_conv_eltwise_params : public weight_bias_params { uint32_t split = 1; bool depthwise_separable_opt = false; bool transposed = false; - bool int8_quantization = false; - bool output_calibration = false; bool local_convolution = false; - float input_quantization_factor = 1.0f; - float output_quantization_factor = 1.0f; - MultiDataTensor weights_quantization_factors; - MultiDataTensor output_calibration_factors; std::vector activations; } conv; @@ -55,14 +49,8 @@ struct fused_conv_eltwise_params : public weight_bias_params { std::vector stride; bool layoutBased = false; - bool int8_quantization = false; - bool output_calibration = false; - float output_quantization_factor = 1.0f; - - MultiDataTensor output_calibration_factors; } eltw; - float non_conv_scale = 1.0f; bool second_input_in_output = false; bool depth_to_space_already_fused = false; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp 
deleted file mode 100644 index 837430da70d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "fused_conv_eltwise_kernel_gemm.h" -#include "kernel_selector_utils.h" -#include -#include - -namespace kernel_selector { - -ParamsKey fused_conv_eltwise_kernel_gemm::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F16); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableSubGroup(); - // k.EnableSubGroupShort(); // we need it for FP16 only. we check it on the Validate phase - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableFusedConvEltwSplitSupport(); - return k; -} - -std::string fused_conv_eltwise_kernel_gemm::GetKernelName(const fused_conv_eltwise_params& params) const { - if (params.inputs[0].GetDType() == Datatype::F32) { - return kernelName + "_fp32"; - } else { - return kernelName + "_fp16"; - } -} - -bool fused_conv_eltwise_kernel_gemm::Validate(const Params& p, const optional_params& o) const { - if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - return true; -} - -WeightsLayout fused_conv_eltwise_kernel_gemm::GetPreferreddWeightsLayout( - const fused_conv_eltwise_params ¶ms) const { - if (params.inputs[0].GetDType() == Datatype::F16) { - return WeightsLayout::iy_xs_os_xsv2_osv16__ao32; - } else { - return WeightsLayout::iy_xs_os_xsv2_osv8__ao32; - } -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_gemm::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = Parent::SetDefault(arg); - - runInfo.lws0 = 1; - runInfo.lws2 = 1; - - if (arg.inputs[0].GetDType() == Datatype::F16) { - runInfo.gemmStyle = {1, arg.conv.filterSize.x, 32, 32, 1, 1}; - runInfo.lws1 = 16; - runInfo.efficiency = FORCE_PRIORITY_6; - } else { - runInfo.gemmStyle = {2, 
arg.conv.filterSize.x, 32, 32, 2, 1}; - runInfo.lws1 = 8; - runInfo.efficiency = FORCE_PRIORITY_8; - } - - size_t sgemm_m = RoundUp(arg.output.X().v * arg.output.Y().v, runInfo.gemmStyle.subBlockDimM); - size_t sgemm_n = RoundUp(arg.output.Feature().v, runInfo.gemmStyle.subBlockDimN); - - runInfo.gws0 = RoundUp(CeilDiv(sgemm_n, runInfo.gemmStyle.globalWorkSizeDX), runInfo.lws0); - runInfo.gws1 = RoundUp(CeilDiv(sgemm_m, runInfo.gemmStyle.globalWorkSizeDY), runInfo.lws1); - runInfo.gws2 = arg.output.Batch().v; - - return runInfo; -} - -JitConstants fused_conv_eltwise_kernel_gemm::GetJitConstants(const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstants({ - MakeJitConstant("ALIGNED_OFM", RoundUp(params.output.Feature().v, runInfo.gemmStyle.subBlockDimN)), - MakeJitConstant("DX", runInfo.gemmStyle.globalWorkSizeDX), - MakeJitConstant("DY", runInfo.gemmStyle.globalWorkSizeDY), - MakeJitConstant("FILTER_SIZE_X_DIV2", params.conv.filterSize.x / 2), - MakeJitConstant("INPUT_BUFFER_WIDTH_PADDED", ""), // TODO: enable non padding path again - MakeJitConstant("INPUT_BUFFER_HEIGHT_PADDED", ""), - }); - - if (CeilDiv(RoundUp(params.output.X().v * params.output.Y().v, runInfo.gemmStyle.subBlockDimM), - runInfo.gemmStyle.globalWorkSizeDY) % - runInfo.lws1 != - 0) - jit.AddConstant(MakeJitConstant("LEFTOVERS", 1)); - - return jit; -} - -KernelsData fused_conv_eltwise_kernel_gemm::GetKernelsData(const Params& params, const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h deleted file mode 100644 index 9696f96253a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
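The SetDefault logic above sizes the GEMM-style launch by rounding the flattened spatial extent (M) and the feature count (N) up to sub-block multiples, then dividing by the per-thread work and re-rounding to the local size. A minimal standalone sketch of that arithmetic, assuming CeilDiv/RoundUp have the usual rounding semantics and using illustrative shapes plus an assumed mapping of the gemmStyle initializer onto named fields (none of the concrete numbers below come from this patch):

#include <cstddef>
#include <cstdio>

// Assumed semantics of kernel_selector's helpers: CeilDiv rounds the quotient
// up, RoundUp rounds v up to the next multiple of m.
static size_t CeilDiv(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t v, size_t m) { return CeilDiv(v, m) * m; }

int main() {
    // Illustrative output shape: 56x56 spatial, 96 features, batch 1.
    const size_t out_x = 56, out_y = 56, out_f = 96, batch = 1;
    // FP16 path above: 32x32 sub-blocks, one column/row per thread
    // (DX = DY = 1, an assumption about the gemmStyle field order),
    // and a 1x16 local size.
    const size_t subBlockDimM = 32, subBlockDimN = 32, dx = 1, dy = 1;
    const size_t lws0 = 1, lws1 = 16;

    const size_t sgemm_m = RoundUp(out_x * out_y, subBlockDimM);  // 3136
    const size_t sgemm_n = RoundUp(out_f, subBlockDimN);          // 96

    const size_t gws0 = RoundUp(CeilDiv(sgemm_n, dx), lws0);      // 96
    const size_t gws1 = RoundUp(CeilDiv(sgemm_m, dy), lws1);      // 3136
    const size_t gws2 = batch;

    printf("gws = {%zu, %zu, %zu}, lws = {%zu, %zu, 1}\n",
           gws0, gws1, gws2, lws0, lws1);
    return 0;
}

With these inputs CeilDiv(sgemm_m, dy) is already a multiple of lws1, so the LEFTOVERS branch above would stay disabled.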
- - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_gemm : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_gemm() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_gemm") {} - - virtual ~fused_conv_eltwise_kernel_gemm() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override; - std::string GetKernelName(const fused_conv_eltwise_params& params) const override; - bool NeedPaddedInput() const override { return true; } - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - bool Validate(const Params& p, const optional_params& o) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp deleted file mode 100644 index e299c2cfaa8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) 2019-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "fused_conv_eltwise_kernel_imad.h" -#include "common_tools.h" -#include "kernel_selector_utils.h" -#include - -// -// Kernel specific constants -// -#define SIMD_SIZE 16 - -static bool getOutBlock_WH(size_t output_size, - size_t stride, - size_t kernel_size, - size_t& output_block_w, - size_t& output_block_h) { - bool verify_output_ranges = false; - - output_block_w = output_block_h = 0; - - size_t upper_border = output_size < SIMD_SIZE ? output_size : SIMD_SIZE; - - size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1)) / stride; - - size_t max_posible_tile_size = upper_border < stride_restrictions ? 
upper_border : stride_restrictions; - - if (output_size % max_posible_tile_size == 0) { - output_block_w = max_posible_tile_size; - } else { - size_t min_horisontal_block_size = 2; // 4; - - size_t block_size = 0; - - for (size_t i = min_horisontal_block_size; i < max_posible_tile_size; i++) { - if (output_size % i == 0) - block_size = i; - } - - if (block_size != 0) { - output_block_w = block_size; - } else { - output_block_w = max_posible_tile_size; - verify_output_ranges = true; - } - } - - if (output_block_w <= 4) - output_block_h = output_block_w; - else - output_block_h = 1; - - return verify_output_ranges; -} -namespace kernel_selector { - -ParamsKey fused_conv_eltwise_kernel_imad::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputWeightsType(WeightsType::UINT8); - k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::byxf_af32); - - k.EnableDifferentTypes(); - k.EnableDifferentInputWeightsTypes(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - k.EnableEltwiseStride(); - return k; -} - -KernelsData fused_conv_eltwise_kernel_imad::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); -} - -JitConstants fused_conv_eltwise_kernel_imad::GetJitConstants(const fused_conv_eltwise_params& params, - const DispatchData& kd) const { - auto mem_consts = Parent::GetJitConstants(params, kd); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - mem_consts.Merge(MakeActivationJitConstants(params.conv.activations, GetUnitType(params), "_CONV_TYPED", true)); - mem_consts.Merge(MakeActivationJitConstants(params.activations, GetUnitType(params), "_ELTW_TYPED", true)); - mem_consts.Merge(MakeTypeJitConstants(Datatype::F32, "float")); - - const auto& iDims = input.GetDims(); - const auto& oDims = output.GetDims(); - const auto& weights = params.weights; - const auto& wDims = weights.GetDims(); - const int iX = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::X); - const int iY = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::Y); - const int iF = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::FEATURE); - const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM); - const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X); - const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y); - mem_consts.AddConstants({ - MakeJitConstant("_IW", iDims[iX].v), - MakeJitConstant("_IH", iDims[iY].v), - MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)), - MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after), - MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after), - MakeJitConstant("_OW", oDims[oX].v), - MakeJitConstant("_OH", oDims[oY].v), - MakeJitConstant("_OD", wDims[wOD].v), - MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after), - 
MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after), - MakeJitConstant("SIMD_SIZE", SIMD_SIZE), - MakeJitConstant("K_HEIGHT", wDims[iY].v), - MakeJitConstant("K_WIDTH", wDims[iX].v), - MakeJitConstant("K_STRIDE", params.conv.stride.x), // X and Y must be equal - MakeJitConstant("NON_BLOCK_LOAD", 1), - }); - - size_t obw, obh; - bool verify_output_ranges = getOutBlock_WH(oDims[oX].v, params.conv.stride.x, wDims[iX].v, obw, obh); - mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", obw), - MakeJitConstant("OUT_BLOCK_HEIGHT", obh), - MakeJitConstant("NEED_TO_VERIFY_OUTPUT_RANGES", verify_output_ranges)}); - if (params.non_conv_scale != 1.0f) - mem_consts.AddConstant(MakeJitConstant("NON_CONV_SCALE", params.non_conv_scale)); - - return mem_consts; -} // GetJitConstants - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_imad::SetDefault( - const fused_conv_eltwise_params& params, - int) const { - DispatchData kd; - - const auto& in = params.inputs[0]; - const auto& weights = params.weights; - const auto& iDims = in.GetDims(); - const auto& wDims = weights.GetDims(); - const int iX = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::X); - const int iY = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::Y); - const int iB = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::BATCH); - const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM); - - size_t otw, oth; - getOutBlock_WH(iDims[iX].v, params.conv.stride.x, iDims[iX].pad.before + iDims[iX].pad.after, otw, oth); - - size_t dim_add = ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE); - if (dim_add != 0) - dim_add = SIMD_SIZE - dim_add; - - - std::vector global = {// globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW; - // number of tiles needed to cover output width - (((iDims[iX].v / params.conv.stride.x) + (otw - 1)) / otw), - - // globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH; - // number of tiles needed to cover output height - (((iDims[iY].v / params.conv.stride.y) + (oth - 1)) / oth), - - // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE); - // round depth range up - ((wDims[wOD].v * iDims[iB].v) + dim_add)}; - - std::vector local = {1, 1, SIMD_SIZE}; - - kd.gws0 = global[0]; - kd.gws1 = global[1]; - kd.gws2 = global[2]; - - kd.lws0 = local[0]; - kd.lws1 = local[1]; - kd.lws2 = local[2]; - - kd.cldnnStyle = {0, 0, 0, 0, 0}; - kd.gemmStyle = {0, 0, 0, 0, 0, 0}; - kd.efficiency = FORCE_PRIORITY_2; - - return kd; -} // SetDefault - -bool fused_conv_eltwise_kernel_imad::Validate(const Params& params, const optional_params& options) const { - if (!Parent::Validate(params, options)) { - return false; - } - - KernelData kd = KernelData::Default(params); - fused_conv_eltwise_params& newParams = *static_cast(kd.params.get()); - - if (newParams.conv.stride.x != newParams.conv.stride.y) { - // Strides must be equial - return false; - } else if ((newParams.conv.filterSize.x != m_FilterSizeX) || (newParams.conv.filterSize.y != m_FilterSizeY)) { - // Kernel does not support such filter size - return false; - } - - return true; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h deleted file mode 100644 index 7af500b2975..00000000000 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_imad : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_imad() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_imad") {} - - virtual ~fused_conv_eltwise_kernel_imad() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - bool NeedPaddedInput() const override { return true; } - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::os_is_yx_osv16_isv4; - } - - size_t m_FilterSizeX = 1; - size_t m_FilterSizeY = 1; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp deleted file mode 100644 index 4859a0c9729..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - return k; -} - -bool fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::Validate(const Params& p, - const optional_params& o) const { - if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) { - return false; - } - - const fused_conv_eltwise_params& cp = static_cast(p); - - // make sure it's 1x1 conv - if (cp.conv.filterSize.x != 1 || cp.conv.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.conv.stride.x != 1 || cp.conv.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 128 != 0) // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M=128 - return false; - - if (k % 32 != 0) // Matrix size K, Must be mutliple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 128; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - 
return runInfo; -} - -JitConstants fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetJitConstants( - const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 128)); // Work-Group tile size M, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", 1)); // Persistent threads - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", 1)); // Persistent threads - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - bool eltw_padding = false; - if (!params.second_input_in_output) { - // for second input - const size_t in2_x_pitch = 32 * 4; - const size_t in2_y_pitch = 32 * 4 * params.inputs[1].X().LogicalDimPadded(); - const size_t in2_b_block_pitch = in2_y_pitch * params.inputs[1].Y().LogicalDimPadded(); - const size_t in2_f_block_pitch = in2_b_block_pitch * ((params.inputs[1].Batch().v + 3) / 4); - const size_t in2_offset = - in2_x_pitch * params.inputs[1].X().pad.before + in2_y_pitch * params.inputs[1].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN2_X_PITCH", in2_x_pitch)); - jit.AddConstant(MakeJitConstant("IN2_Y_PITCH", in2_y_pitch)); - jit.AddConstant(MakeJitConstant("IN2_B_BLOCK_PITCH", in2_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_F_BLOCK_PITCH", 
in2_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_OFFSET", in2_offset)); - - eltw_padding = params.inputs[1].X().pad.Total() != 0 || params.inputs[1].Y().pad.Total() != 0; - } else { - eltw_padding = out_padding; - } - - jit.AddConstant(MakeJitConstant("ELTW_WITH_PADDING", eltw_padding)); - - return jit; -} - -KernelsData fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetKernelsData( - const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h deleted file mode 100644 index a02d08ca643..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
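The OUT_* and IN2_* pitch constants above follow a single pattern for the fs_bs_yx_bsv4_fsv32 layout: one x step spans a block of 32 features by 4 batches, and each coarser pitch multiplies in the padded extent of the next dimension. A self-contained sketch of that pattern, where the Dim struct is a hypothetical stand-in for the tensor API used here and the shapes are illustrative:

#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for one padded tensor dimension; LogicalDimPadded is
// assumed to mean the logical size plus padding on both sides.
struct Dim {
    size_t v, pad_before, pad_after;
    size_t LogicalDimPadded() const { return v + pad_before + pad_after; }
};

int main() {
    // Illustrative tensor: 28x28 spatial, batch 4, 1-pixel spatial padding.
    Dim x{28, 1, 1}, y{28, 1, 1}, b{4, 0, 0};

    const size_t x_pitch       = 32 * 4;                           // 32 features x 4 batches
    const size_t y_pitch       = x_pitch * x.LogicalDimPadded();   // one padded row
    const size_t b_block_pitch = y_pitch * y.LogicalDimPadded();   // one padded plane
    const size_t f_block_pitch = b_block_pitch * ((b.v + 3) / 4);  // ceil(batch / 4) blocks
    const size_t offset        = x_pitch * x.pad_before + y_pitch * y.pad_before;

    printf("x=%zu y=%zu b_block=%zu f_block=%zu offset=%zu\n",
           x_pitch, y_pitch, b_block_pitch, f_block_pitch, offset);
    return 0;
}

The kernel emits the same five values twice, once for the output tensor and once for the second eltwise input, which is why OUT_WITH_PADDING and ELTW_WITH_PADDING are tracked separately.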
- - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8 : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8() - : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8") {} - - virtual ~fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp deleted file mode 100644 index c0d8cd6f17a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - return k; -} - -bool fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, - const optional_params& o) const { - if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 224 != 0) // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M=128 - return false; - - if (k % 32 != 0) // Matrix size K, Must be mutliple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 224; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - 
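SetDefault above turns the matrix view of the fused operation (M = X*Y*batch rows, N = output features) into a launch grid: each sub-group column covers SG_TILE_N features spread over SG_SIZE lanes, each sub-group row covers SG_TILE_M matrix rows, and the local size is fixed by the work-group to sub-group tile ratios. A standalone sketch with made-up matrix sizes that satisfy the multiple-of-32 checks in Validate:

#include <cstddef>
#include <cstdio>

int main() {
    // Tile constants as defined at the top of this kernel: 32x32 sub-group
    // tiles, sub-group size 8, one tile per sub-group in each direction,
    // 224x128 work-group tile.
    const size_t SG_TILE_M = 32, SG_TILE_N = 32, SG_SIZE = 8;
    const size_t TILES_PER_SG_X = 1, TILES_PER_SG_Y = 1;
    const size_t WG_TILE_M = 224, WG_TILE_N = 128;

    // Illustrative matrix sizes only: M = X*Y*batch, N = output features.
    const size_t MATRIX_M = 448, MATRIX_N = 256;

    const size_t threadsX = (MATRIX_N / (SG_TILE_N / SG_SIZE)) / TILES_PER_SG_X;  // 64
    const size_t threadsY = (MATRIX_M / SG_TILE_M) / TILES_PER_SG_Y;              // 14

    const size_t lws0 = SG_SIZE * WG_TILE_N / SG_TILE_N;  // 8 * 128 / 32 = 32
    const size_t lws1 = WG_TILE_M / SG_TILE_M;            // 224 / 32 = 7

    printf("gws = {%zu, %zu, 1}, lws = {%zu, %zu, 1}\n",
           threadsX, threadsY, lws0, lws1);
    return 0;
}

With M = 448 and N = 256 this yields a 64x14 thread grid over 32x7 work-items per group, so both global sizes divide evenly by the local sizes.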
-JitConstants fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetJitConstants( - const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 224)); // Work-Group tile size M, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - bool eltw_padding = false; - if (!params.second_input_in_output) { - // for second input - const size_t in2_x_pitch = 32 * 4; - const size_t in2_y_pitch = 32 * 4 * params.inputs[1].X().LogicalDimPadded(); - const size_t in2_b_block_pitch = in2_y_pitch * params.inputs[1].Y().LogicalDimPadded(); - const size_t in2_f_block_pitch = in2_b_block_pitch * ((params.inputs[1].Batch().v + 3) / 4); - const size_t in2_offset = - in2_x_pitch * params.inputs[1].X().pad.before + in2_y_pitch * params.inputs[1].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN2_X_PITCH", in2_x_pitch)); - jit.AddConstant(MakeJitConstant("IN2_Y_PITCH", in2_y_pitch)); - 
jit.AddConstant(MakeJitConstant("IN2_B_BLOCK_PITCH", in2_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_F_BLOCK_PITCH", in2_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_OFFSET", in2_offset)); - - eltw_padding = params.inputs[1].X().pad.Total() != 0 || params.inputs[1].Y().pad.Total() != 0; - } else { - eltw_padding = out_padding; - } - - jit.AddConstant(MakeJitConstant("ELTW_WITH_PADDING", eltw_padding)); - - return jit; -} - -KernelsData fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetKernelsData( - const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h deleted file mode 100644 index a43f3232824..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8 : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8() - : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8") {} - - virtual ~fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp index fe1976bc44d..da1c3c8e16f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp @@ -14,26 +14,16 @@ #include "fused_conv_eltwise_kernel_selector.h" -#include "fused_conv_eltwise_kernel_gemm.h" #include "fused_conv_eltwise_kernel_bfyx_1x1_opt.h" #include "fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h" -#include "fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h" #include "fused_conv_eltwise_kernel_yxfb_yxio_b16.h" -#include "fused_conv_eltwise_kernel_imad.h" -#include "fused_conv_eltwise_kernel_af32_imad_1x1.h" #include "fused_conv_eltwise_kernel_bfyx_iyxo.h" namespace kernel_selector { fused_conv_eltwise_kernel_selector::fused_conv_eltwise_kernel_selector() { - // Attach(); Attach(); Attach(); Attach(); - Attach(); - Attach(); - Attach(); - Attach(); Attach(); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp deleted file mode 100644 index 6afc7a8adf7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2018-2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp
deleted file mode 100644
index 6afc7a8adf7..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2018-2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "index_select_kernel_base.h"
-
-#include "kernel_selector_utils.h"
-#include <sstream>
-#include <vector>
-
-namespace kernel_selector {
-JitConstants IndexSelectKernelBase::GetJitConstants(const index_select_params& params) const {
-    JitConstants jit = MakeBaseParamsJitConstants(params);
-
-    jit.AddConstant(MakeJitConstant("AXES_NUMBER", params.axes.size()));
-
-    if (params.reverse) {
-        jit.AddConstant(MakeJitConstant("REVERSE", 1));
-    }
-
-    for (size_t i = 0; i < params.axes.size(); i++) {
-        std::string size_name = "REVERSE_AXIS_SIZE";
-        size_t size_value = 0;
-        if (params.axes.size() > 1) {
-            std::stringstream ss;
-            ss << "REVERSE_" << toString(params.axes[i]) << "_SIZE";
-            size_name = ss.str();
-        }
-        jit.AddConstant(MakeJitConstant(toString(params.axes[i]), ""));
-        if (params.reverse) {
-            if (params.axes[i] == IndexSelectAxis::BATCH) {
-                size_value = params.inputs.at(0).Batch().v;
-            } else if (params.axes[i] == IndexSelectAxis::X) {
-                size_value = params.inputs.at(0).X().v;
-            } else if (params.axes[i] == IndexSelectAxis::Y) {
-                size_value = params.inputs.at(0).Y().v;
-            } else if (params.axes[i] == IndexSelectAxis::FEATURE) {
-                size_value = params.inputs.at(0).Feature().v;
-            }
-        }
-        jit.AddConstant(MakeJitConstant(size_name, size_value));
-    }
-
-    return jit;
-}
-
-IndexSelectKernelBase::DispatchData IndexSelectKernelBase::SetDefault(const index_select_params& params) {
-    const auto& output = params.output;
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
-    std::vector<size_t> global;
-
-    if (params.axes.size() == 1) {
-        if (params.reverse) {
-            if (params.axes[0] == IndexSelectAxis::BATCH) {
-                global = {1, params.inputs.at(0).Batch().v, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::X) {
-                global = {output.Batch().v, params.inputs.at(0).X().v, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::Y) {
-                global = {output.Batch().v, params.inputs.at(0).Y().v, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::FEATURE) {
-                global = {output.Batch().v, params.inputs.at(0).Feature().v, output.Y().v};
-            }
-        } else {
-            const auto indices = params.inputs.at(1).X().v;
-
-            if (params.axes[0] == IndexSelectAxis::BATCH) {
-                global = {1, indices, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::X || params.axes[0] == IndexSelectAxis::Y) {
-                global = {output.Batch().v, indices, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::FEATURE) {
-                global = {output.Batch().v, indices, output.Y().v};
-            }
-        }
-    } else {
-        if (params.reverse) {
-            global = {output.Batch().v, output.Y().v, output.Feature().v};
-        }
-    }
-
-    const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
-    kd.gws0 = global[0];
-    kd.gws1 = global[1];
-    kd.gws2 = global[2];
-
-    kd.lws0 = local[0];
-    kd.lws1 = local[1];
-    kd.lws2 = local[2];
-
-    return kd;
-}
-
-KernelsData IndexSelectKernelBase::GetCommonKernelsData(const Params& params,
-                                                        const optional_params& options,
-                                                        float estimated_time) const {
-    assert(params.GetType() == KernelType::INDEX_SELECT);
-
-    const auto& prim_params =
-        static_cast<const index_select_params&>(params);
-
-    auto run_info = SetDefault(prim_params);
-    KernelData k_data = KernelData::Default<index_select_params>(params);
-
-    auto cldnn_jit = GetJitConstants(prim_params);
-    auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = k_data.kernels[0];
-    FillCLKernelData(kernel,
-                     run_info,
-                     params.engineInfo,
-                     kernelName,
-                     jit,
-                     entry_point,
-                     DEFAULT,
-                     false,
-                     false,
-                     (uint32_t)prim_params.inputs.size());
-
-    k_data.estimatedTime = estimated_time;
-
-    return {k_data};
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h
deleted file mode 100644
index 3d19510f333..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018-2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "common_kernel_base.h"
-#include "kernel_selector_params.h"
-#include <vector>
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// index_select_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct index_select_params : public base_params {
-    index_select_params() : base_params(KernelType::INDEX_SELECT) {}
-
-    std::vector<IndexSelectAxis> axes = {IndexSelectAxis::BATCH};
-    bool reverse = false;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// index_select_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct index_select_optional_params : optional_params {
-    index_select_optional_params() : optional_params(KernelType::INDEX_SELECT) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// IndexSelectKernelBase
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class IndexSelectKernelBase : public common_kernel_base {
-public:
-    using common_kernel_base::common_kernel_base;
-    virtual ~IndexSelectKernelBase() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    JitConstants GetJitConstants(const index_select_params& params) const;
-    static DispatchData SetDefault(const index_select_params& params);
-    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const;
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp
deleted file mode 100644
index 47f4a7554d2..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "index_select_kernel_ref.h"
-
-namespace kernel_selector {
-ParamsKey IndexSelectKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableInputDataType(Datatype::UINT8);
-    k.EnableInputDataType(Datatype::INT32);
-
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::INT32);
-
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::yxfb);
-
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::yxfb);
-
-    k.EnableBatching();
-
-    k.EnableIndexSelectAxis(IndexSelectAxis::BATCH);
-    k.EnableIndexSelectAxis(IndexSelectAxis::FEATURE);
-    k.EnableIndexSelectAxis(IndexSelectAxis::Y);
-    k.EnableIndexSelectAxis(IndexSelectAxis::X);
-
-    k.EnableDifferentTypes();
-
-    return k;
-}
-
-KernelsData IndexSelectKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h
deleted file mode 100644
index a185b0deb3e..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "index_select_kernel_base.h"
-
-namespace kernel_selector {
-class IndexSelectKernelRef : public IndexSelectKernelBase {
-public:
-    IndexSelectKernelRef() : IndexSelectKernelBase("index_select_gpu_ref") {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp
deleted file mode 100644
index 674d5ca544f..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "index_select_kernel_selector.h"
-#include "index_select_kernel_ref.h"
-
-namespace kernel_selector {
-index_select_kernel_selector::index_select_kernel_selector() { Attach<IndexSelectKernelRef>(); }
-
-KernelsData index_select_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::INDEX_SELECT);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h
deleted file mode 100644
index f8030c98432..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class index_select_kernel_selector : public kernel_selector_base {
-public:
-    static index_select_kernel_selector& Instance() {
-        static index_select_kernel_selector instance;
-        return instance;
-    }
-
-    index_select_kernel_selector();
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
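[Reviewer note, not part of the patch: the files deleted above implemented clDNN's index_select, which gathered entries (or reversed them) along one axis of a 4-D tensor. As a rough functional reference for what is being dropped, the behaviour of the gather path corresponds to something like this host-side sketch (hypothetical helper, dense bfyx layout assumed, feature axis only; the real kernels handled padded, strided DataTensors and all four axes):

    #include <cstddef>
    #include <vector>

    // Functional sketch of index_select along the feature axis of a bfyx tensor.
    std::vector<float> index_select_feature(const std::vector<float>& input,
                                            size_t b, size_t f, size_t y, size_t x,
                                            const std::vector<size_t>& indices) {
        std::vector<float> out(b * indices.size() * y * x);
        for (size_t bi = 0; bi < b; ++bi)
            for (size_t oi = 0; oi < indices.size(); ++oi)
                for (size_t yi = 0; yi < y; ++yi)
                    for (size_t xi = 0; xi < x; ++xi) {
                        // bfyx linear index: ((b * F + f) * Y + y) * X + x
                        size_t src = ((bi * f + indices[oi]) * y + yi) * x + xi;
                        size_t dst = ((bi * indices.size() + oi) * y + yi) * x + xi;
                        out[dst] = input[src];
                    }
        return out;
    }
]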
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp
deleted file mode 100644
index cca47b2aa18..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_axis.h"
-#include <algorithm>
-
-namespace kernel_selector {
-ParamsKey LookUpTableKernelAxis::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableLookUpTableIndicesFormat(Datatype::F32);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableLookUpTableAxis(LookUpTableAxis::BATCH);
-    k.EnableLookUpTableAxis(LookUpTableAxis::X);
-    k.EnableLookUpTableAxis(LookUpTableAxis::Y);
-    k.EnableLookUpTableAxis(LookUpTableAxis::FEATURE);
-    k.EnableBatching();
-    return k;
-}
-
-KernelsData LookUpTableKernelAxis::GetKernelsData(const Params& params, const optional_params& options) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    const lookup_table_params& orgParams = static_cast<const lookup_table_params&>(params);
-
-    DispatchData runInfo;
-    runInfo.fp16UnitUsed = orgParams.inputs[0].GetDType() == Datatype::F16;
-
-    if (orgParams.lookUpTableAxis == LookUpTableAxis::BATCH) {
-        runInfo.gws0 = orgParams.inputs[0].X().v;
-        runInfo.gws1 = orgParams.inputs[0].Y().v;
-        runInfo.gws2 = orgParams.inputs[0].Feature().v;
-    } else if (orgParams.lookUpTableAxis == LookUpTableAxis::FEATURE) {
-        runInfo.gws0 = orgParams.inputs[0].X().v;
-        runInfo.gws1 = orgParams.inputs[0].Y().v;
-        runInfo.gws2 = orgParams.inputs[0].Batch().v;
-    } else if (orgParams.lookUpTableAxis == LookUpTableAxis::Y) {
-        runInfo.gws0 = orgParams.inputs[0].X().v;
-        runInfo.gws1 = orgParams.inputs[0].Feature().v;
-        runInfo.gws2 = orgParams.inputs[0].Batch().v;
-    } else if (orgParams.lookUpTableAxis == LookUpTableAxis::X) {
-        runInfo.gws0 = orgParams.inputs[0].Y().v;
-        runInfo.gws1 = orgParams.inputs[0].Feature().v;
-        runInfo.gws2 = orgParams.inputs[0].Batch().v;
-    }
-
-    runInfo.lws0 = std::min(std::max(runInfo.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
-    while (runInfo.gws0 % runInfo.lws0 != 0) {
-        --runInfo.lws0;
-    }
-    runInfo.lws1 = 1;
-    runInfo.lws2 = 1;
-
-    KernelData kd = KernelData::Default<lookup_table_params>(params);
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2);
-
-    kd.estimatedTime = FORCE_PRIORITY_9;
-
-    return {kd};
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h
deleted file mode 100644
index 90bb61011b4..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "lookup_table_kernel_base.h"
-
-namespace kernel_selector {
-class LookUpTableKernelAxis : public LookUpTableKernelBase {
-public:
-    LookUpTableKernelAxis() : LookUpTableKernelBase("lookup_table_axis") {}
-    virtual ~LookUpTableKernelAxis() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp
deleted file mode 100644
index 6874efa07fd..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_base.h"
-#include <algorithm>
-
-namespace kernel_selector {
-bool LookUpTableKernelBase::Validate(const Params& p, const optional_params& o) const {
-    if (p.GetType() != KernelType::LOOKUP_TABLE || o.GetType() != KernelType::LOOKUP_TABLE) {
-        return false;
-    }
-
-    return true;
-}
-
-JitConstants LookUpTableKernelBase::GetJitConstants(const lookup_table_params& params) const {
-    JitConstants jit = MakeBaseParamsJitConstants(params);
-
-    jit.AddConstants({
-        MakeJitConstant("VAL_NUM", params.numberOfValues),
-        MakeJitConstant(toString(params.lookUpTableAxis) + "_AXIS", 1),
-    });
-
-    return jit;
-}
-
-LookUpTableKernelBase::DispatchData LookUpTableKernelBase::SetDefault(const lookup_table_params& params) const {
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
-    // Determine global work sizes.
-    kd.gws0 = params.inputIndices.X().v;
-    kd.gws1 = params.inputIndices.Batch().v;  // B
-    kd.gws2 = 1;
-
-    kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
-    while (kd.gws0 % kd.lws0 != 0) {
-        --kd.lws0;
-    }
-    kd.lws1 = 1;
-    kd.lws2 = 1;
-
-    return kd;
-}
-
-KernelsData LookUpTableKernelBase::GetCommonKernelsData(const Params& params,
-                                                        const optional_params& options,
-                                                        float estimatedTime) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    const lookup_table_params& orgParams = static_cast<const lookup_table_params&>(params);
-
-    DispatchData runInfo = SetDefault(orgParams);
-
-    KernelData kd = KernelData::Default<lookup_table_params>(params);
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2);
-
-    kd.estimatedTime = estimatedTime;
-
-    return {kd};
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h
deleted file mode 100644
index a221f417d68..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "common_kernel_base.h"
-#include "kernel_selector_params.h"
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// lookup_table_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct lookup_table_params : public base_params {
-    lookup_table_params() : base_params(KernelType::LOOKUP_TABLE) {}
-
-    LookUpTableAxis lookUpTableAxis = LookUpTableAxis::XYF;
-    uint32_t numberOfValues = 0;
-    DataTensor inputIndices;
-
-    virtual ParamsKey GetParamsKey() const {
-        ParamsKey k = base_params::GetParamsKey();
-        k.EnableLookUpTableAxis(lookUpTableAxis);
-        k.EnableLookUpTableIndicesFormat(inputIndices.GetDType());
-        return k;
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// lookup_table_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct lookup_table_optional_params : optional_params {
-    lookup_table_optional_params() : optional_params(KernelType::LOOKUP_TABLE) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// lookup_table_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class LookUpTableKernelBase : public common_kernel_base {
-public:
-    using common_kernel_base::common_kernel_base;
-    virtual ~LookUpTableKernelBase() {}
-
-    struct DispatchData : public CommonDispatchData {};
-
-protected:
-    bool Validate(const Params&, const optional_params&) const override;
-    virtual JitConstants GetJitConstants(const lookup_table_params& params) const;
-    virtual DispatchData SetDefault(const lookup_table_params& params) const;
-    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp
deleted file mode 100644
index cd4006c8292..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_ref.h"
-
-namespace kernel_selector {
-ParamsKey LookUpTableKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableLookUpTableIndicesFormat(Datatype::F32);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::yxfb);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::yxfb);
-    k.EnableLookUpTableAxis(LookUpTableAxis::XYF);
-    k.EnableBatching();
-    return k;
-}
-
-KernelsData LookUpTableKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h
deleted file mode 100644
index fab406cba16..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "lookup_table_kernel_base.h"
-
-namespace kernel_selector {
-class LookUpTableKernelRef : public LookUpTableKernelBase {
-public:
-    LookUpTableKernelRef() : LookUpTableKernelBase("lookup_table_ref") {}
-    virtual ~LookUpTableKernelRef() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp
deleted file mode 100644
index 3ad1358bbb3..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_selector.h"
-#include "lookup_table_kernel_ref.h"
-#include "lookup_table_kernel_axis.h"
-
-namespace kernel_selector {
-
-lookup_table_kernel_selector::lookup_table_kernel_selector() {
-    Attach<LookUpTableKernelRef>();
-    Attach<LookUpTableKernelAxis>();
-}
-
-KernelsData lookup_table_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::LOOKUP_TABLE);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h
deleted file mode 100644
index 7dcc535411a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class lookup_table_kernel_selector : public kernel_selector_base {
-public:
-    static lookup_table_kernel_selector& Instance() {
-        static lookup_table_kernel_selector instance_;
-        return instance_;
-    }
-
-    lookup_table_kernel_selector();
-
-    virtual ~lookup_table_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
index 28eb5adc1ee..65b7b99d089 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
@@ -101,7 +101,7 @@ KernelsData PermuteKernelRef::GetKernelsData(const Params& params, const optiona
     kernel.workGroups.global = {in.X().v, in.Y().v * in.Z().v * in.W().v, in.Feature().v * in.Batch().v};
     kernel.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.workGroups.global, params.engineInfo);
     kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
-    kernel.arguments = GetArgsDesc(1, false, false, false, false, GetFusedPrimitiveInputsCount(params));
+    kernel.arguments = GetArgsDesc(1, false, false, GetFusedPrimitiveInputsCount(params));
 
     kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
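[Reviewer note, not part of the patch: one detail worth keeping in mind from the lookup_table code removed above is its work-group heuristic. Both the axis and base kernels derived lws0 with the same clamp-then-largest-divisor loop; extracted as a standalone helper (illustrative only, mirroring the deleted lines), it is:

    #include <algorithm>
    #include <cstddef>

    // Start at min(max(gws0, 1), 32), then decrement until the local size
    // evenly divides the global size, as OpenCL requires lws to divide gws.
    size_t pick_lws0(size_t gws0) {
        size_t lws0 = std::min(std::max(gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
        while (gws0 % lws0 != 0) {
            --lws0;
        }
        return lws0;
    }

The loop always terminates because lws0 = 1 divides everything.]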
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp
deleted file mode 100644
index 5e20ef6349a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_average_opt.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKernelGPUAverageOpt::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnablePoolType(PoolType::AVG);
-    k.EnablePoolRemainder(PoolRemainder::FLOOR);
-    k.EnablePoolRemainder(PoolRemainder::CEIL);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
-    k.EnableDifferentTypes();
-    return k;
-}
-
-bool PoolingKernelGPUAverageOpt::Validate(const Params& p, const optional_params& o) const {
-    if (!PoolingKernelBase::Validate(p, o)) {
-        return false;
-    }
-
-    const pooling_params& params = static_cast<const pooling_params&>(p);
-
-    if (!params.activations.empty()) {
-        return {};
-    }
-
-    if ((params.poolSize.x != 3) || (params.poolSize.y != 3) || (params.poolStride.x != 1) ||
-        (params.poolStride.y != 1) || (params.poolPad.x != 1) || (params.poolPad.y != 1) ||
-        !(params.inputs[0] == params.output) || params.inputs[0].PitchesDifferFromLogicalDims() ||
-        params.output.PitchesDifferFromLogicalDims()) {
-        return false;
-    }
-
-    return true;
-}
-
-static uSize GetTileDimentions() {
-    constexpr int simdSize = 16;
-
-    return {simdSize - 2, 7};
-}
-
-PoolingKernelBase::DispatchData PoolingKernelGPUAverageOpt::SetDefault(const pooling_params& params) const {
-    constexpr int simdSize = 16;
-
-    DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
-    auto tileDims = GetTileDimentions();
-
-    const int numTilesX =
-        static_cast<int>(std::ceil(static_cast<float>(params.inputs[0].X().v) / static_cast<float>(tileDims.x)));
-    const int numTilesY =
-        static_cast<int>(std::ceil(static_cast<float>(params.inputs[0].Y().v) / static_cast<float>(tileDims.y)));
-
-    runInfo.gws0 = numTilesX * simdSize;
-    runInfo.gws1 = numTilesY;
-    runInfo.gws2 = params.inputs[0].Feature().v;
-    runInfo.lws0 = simdSize;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = 1;
-
-    return runInfo;
-}
-
-JitConstants PoolingKernelGPUAverageOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto tileDims = GetTileDimentions();
-    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
-
-    if (tileDims.y != 0 && tileDims.x != 0) {
-        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0));
-        jit.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y));
-        jit.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x));
-        jit.AddConstant(MakeJitConstant("ONE_OVER_POOL_SIZE", 1.f / (params.poolSize.x * params.poolSize.y)));
-    }
-
-    return jit;
-}
-
-KernelsData PoolingKernelGPUAverageOpt::GetKernelsData(const Params& params, const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_7);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h
deleted file mode 100644
index 828434705fa..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-
-namespace kernel_selector {
-class PoolingKernelGPUAverageOpt : public PoolingKernelBase {
-public:
-    PoolingKernelGPUAverageOpt() : PoolingKernelBase("pooling_gpu_average_opt") {}
-    virtual ~PoolingKernelGPUAverageOpt() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    bool Validate(const Params&, const optional_params&) const override;
-    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
-    DispatchData SetDefault(const pooling_params& params) const override;
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp
deleted file mode 100644
index 9f5a2520440..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    k.EnablePoolType(PoolType::MAX);
-    k.EnablePoolType(PoolType::AVG);
-    k.EnablePoolRemainder(PoolRemainder::FLOOR);
-    k.EnablePoolRemainder(PoolRemainder::CEIL);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
-    k.EnableDifferentTypes();
-    return k;
-}
-
-PoolingKernelBase::DispatchData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::SetDefault(const pooling_params& params) const {
-    constexpr int simdSize = 8;
-
-    DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
-    runInfo.gws0 = params.output.X().v;
-    runInfo.gws1 = params.output.Y().v;
-    // we got fs_bs_yx_bsv4_fsv32 format, we process 4 batches and 4 features per workitem
-    runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * RoundUp(params.output.Batch().v, 4)) / (4 * 4);
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = simdSize;
-
-    return runInfo;
-}
-
-JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
-    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf = {"",
-                                      {"b + bi", "f", "y", "x"},
-                                      "char_result",
-                                      input_dt,
-                                      4,
-                                      LoadType::LT_UNALIGNED,
-                                      BoundaryCheck::ENABLED,
-                                      IndexType::TENSOR_COORD,
-                                      Tensor::DataChannelName::FEATURE};
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
-    }
-
-    return jit;
-}
-
-bool PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::Validate(const Params& params, const optional_params& options) const {
-    if (!PoolingKernelBase::Validate(params, options)) {
-        return false;
-    }
-
-    auto p = dynamic_cast<const pooling_params&>(params);
-
-    if (p.quantization != QuantizationType::NONE && p.poolType == PoolType::AVG) {
-        return false;
-    }
-
-    return true;
-}
-KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params,
                                                                 const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_2);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h
deleted file mode 100644
index 307b426a563..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-
-namespace kernel_selector {
-class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32 : public PoolingKernelBase {
-public:
-    PoolingKerneGPU_fs_bs_yx_bsv4_fsv32() : PoolingKernelBase("pooling_gpu_fs_bs_yx_bsv4_fsv32") {}
-    virtual ~PoolingKerneGPU_fs_bs_yx_bsv4_fsv32() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-    DispatchData SetDefault(const pooling_params& params) const override;
-
-protected:
-    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
-    bool Validate(const Params&, const optional_params&) const override;
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp
deleted file mode 100644
index 34f97ab9cc9..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    k.EnablePoolType(PoolType::MAX);
-    k.EnablePoolRemainder(PoolRemainder::FLOOR);
-    k.EnablePoolRemainder(PoolRemainder::CEIL);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
-    k.EnableDifferentTypes();
-    return k;
-}
-
-size_t static get_batch_sub_groups_count(const pooling_params& params) {
-    if (params.inputs[0].Batch().v % 32 == 0)
-        return 8;  // divided by 4 because we process 4 batches per subgroup
-    return 1;
-}
-
-PoolingKernelBase::DispatchData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::SetDefault(
-    const pooling_params& params) const {
-    constexpr int simdSize = 32;
-
-    DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
-    runInfo.gws0 = params.output.X().v;
-    runInfo.gws1 = params.output.Y().v;
-    // we got fs_bs_yx_bsv4_fsv32 format, we process 4 batches and 4 features per workitem
-    runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * RoundUp(params.output.Batch().v, 4)) / (4);  // *4);
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = simdSize * get_batch_sub_groups_count(params);
-
-    return runInfo;
-}
-
-JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetJitConstants(const pooling_params& params,
                                                                          DispatchData kd) const {
-    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-    jit.AddConstant(MakeJitConstant("BATCH_SG_COUNT", get_batch_sub_groups_count(params)));
-    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
-    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf = {"",
-                                      {"b", "f", "y", "x"},
-                                      "pool_result",
-                                      input_dt,
-                                      4,
-                                      LoadType::LT_UNALIGNED,
-                                      BoundaryCheck::ENABLED,
-                                      IndexType::TENSOR_COORD,
-                                      Tensor::DataChannelName::FEATURE};
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
-    }
-
-    return jit;
-}
-
-KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetKernelsData(const Params& params,
                                                                        const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h
deleted file mode 100644
index 3e2de8f1a3b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-
-namespace kernel_selector {
-class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32 : public PoolingKernelBase {
-public:
-    PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32() : PoolingKernelBase("pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32") {}
-    virtual ~PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-    DispatchData SetDefault(const pooling_params& params) const override;
-
-protected:
-    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
index f6a5bf0aa73..11ae5cf90ae 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
@@ -16,14 +16,11 @@
 #include "pooling_kernel_selector.h"
 #include "pooling_kernel_gpu_ref.h"
 #include "pooling_kernel_gpu_byxf_opt.h"
-#include "pooling_kernel_gpu_average_opt.h"
 #include "pooling_kernel_gpu_bfyx_block_opt.h"
 #include "pooling_kernel_gpu_byxf_padding_opt.h"
 #include "pooling_kernel_gpu_byxf_af32.h"
 #include "pooling_kernel_gpu_int8_ref.h"
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h"
 #include "pooling_kernel_gpu_b_fs_yx_fsv4.h"
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h"
 #include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
 #include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
 #include "pooling_kernel_gpu_bsv16_fsv16.h"
@@ -34,15 +31,12 @@ namespace kernel_selector {
 
 pooling_kernel_selector::pooling_kernel_selector() {
     Attach();
-    //Attach<PoolingKernelGPUAverageOpt>(); TODO: fix the kernel as it reads out of bounds now
     Attach();
     Attach();
     Attach();
     Attach();
     Attach();
-    Attach<PoolingKerneGPU_fs_bs_yx_bsv4_fsv32>();
     Attach();
-    Attach<PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32>();
     Attach();
     Attach();
     Attach();
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp index e9c1b39c8df..5ec60546c3b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ KernelsData QuantizeKernelBase::GetKernelsData(const Params& params, const optio kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2}; kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2}; kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); - kernel.arguments = GetArgsDesc(static_cast(newParams.inputs.size()), false, false, false, false); + kernel.arguments = GetArgsDesc(static_cast(newParams.inputs.size()), false, false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp deleted file mode 100644 index c34f803bd07..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "scale_grad_weights_kernel_base.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { -JitConstants ScaleGradWeightsKernelBase::GetJitConstants(const scale_grad_weights_params& params) const { - JitConstants jit = training_kernel_base::GetJitConstants(params); - - return jit; -} - -ScaleGradWeightsKernelBase::DispatchData ScaleGradWeightsKernelBase::SetDefault( - const scale_grad_weights_params& params) const { - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - kd.gws0 = params.inputs[0].Batch().v; - kd.gws1 = params.inputs[0].Feature().v; - kd.gws2 = 1; - - kd.lws0 = params.inputs[0].Batch().v; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE; - return kd; -} - -KernelsData ScaleGradWeightsKernelBase::GetKernelsData(const Params& params, const optional_params& options) const { - assert(params.GetType() == KernelType::SCALE_GRAD_WEIGHTS); - - const scale_grad_weights_params& orgParams = static_cast(params); - - DispatchData runInfo = SetDefault(orgParams); - KernelData kd = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, - runInfo, - params.engineInfo, - kernelName, - jit, - entry_point, - DEFAULT, - true, - !orgParams.bias.empty(), - 2); - - if (orgParams.use_momentum) { - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0}); - if (!orgParams.bias.empty()) - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_BIAS_GRADIENT, 0}); - } - kernel.arguments.push_back({ArgumentDescriptor::Types::LEARNING_RATE, 0}); - - kd.estimatedTime = runInfo.efficiency; - - return {kd}; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h deleted file mode 100644 index 3f83d746146..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h +++ /dev/null @@ -1,58 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#pragma once - -#include "training_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// scale_grad_weights_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct scale_grad_weights_params : public training_params { - scale_grad_weights_params() : training_params(KernelType::SCALE_GRAD_WEIGHTS) {} - - virtual ParamsKey GetParamsKey() const { - ParamsKey k = training_params::GetParamsKey(); - - return k; - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// scale_grad_weights_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct scale_grad_weights_optional_params : training_optional_params { - scale_grad_weights_optional_params() : training_optional_params(KernelType::SCALE_GRAD_WEIGHTS) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ScaleGradWeightsKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class ScaleGradWeightsKernelBase : public training_kernel_base { -public: - using training_kernel_base::training_kernel_base; - virtual ~ScaleGradWeightsKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const; - virtual JitConstants GetJitConstants(const scale_grad_weights_params& params) const; - virtual DispatchData SetDefault(const scale_grad_weights_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp deleted file mode 100644 index 25bf58b0513..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "scale_grad_weights_kernel_ref.h" - -namespace kernel_selector { - -ParamsKey ScaleGradWeightsKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableGradient(); - k.EnableBatching(); - k.DisableTuning(); - k.EnableMomentum(); - return k; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h deleted file mode 100644 index f0735a2bd55..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "scale_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ScaleGradWeightsKernelRef : public ScaleGradWeightsKernelBase { -public: - ScaleGradWeightsKernelRef() : ScaleGradWeightsKernelBase("scale_grad_weights_gpu_ref") {} - virtual ~ScaleGradWeightsKernelRef() {} - - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp deleted file mode 100644 index 937ade1ab46..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/
-
-#include "scale_grad_weights_kernel_selector.h"
-#include "scale_grad_weights_kernel_ref.h"
-
-namespace kernel_selector {
-scale_grad_weights_kernel_selector::scale_grad_weights_kernel_selector() { Attach<ScaleGradWeightsKernelRef>(); }
-
-KernelsData scale_grad_weights_kernel_selector::GetBestKernels(const Params& params,
-                                                               const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::SCALE_GRAD_WEIGHTS);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h
deleted file mode 100644
index 7022f96936b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class scale_grad_weights_kernel_selector : public kernel_selector_base {
-public:
-    static scale_grad_weights_kernel_selector& Instance() {
-        static scale_grad_weights_kernel_selector instance_;
-        return instance_;
-    }
-
-    scale_grad_weights_kernel_selector();
-
-    virtual ~scale_grad_weights_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp
deleted file mode 100644
index 256889e1b6b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
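[Editor's note] Every *_kernel_selector pair deleted in this patch (scale_grad_weights above, softmax_loss_grad below) follows the same clDNN registration pattern: a Meyers singleton whose constructor Attach<>()es each candidate implementation, and a GetBestKernels() that defers to GetNaiveBestKernel() with the matching KernelType. A minimal, self-contained sketch of that pattern — simplified stand-in types, not the real clDNN headers:

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-ins for kernel_selector's Params/KernelsData machinery.
struct Params { std::string layer_id; };
using KernelsData = std::vector<std::string>;

class example_kernel_selector {
public:
    // Meyers singleton: one registry per primitive type.
    static example_kernel_selector& Instance() {
        static example_kernel_selector instance_;
        return instance_;
    }

    KernelsData GetBestKernels(const Params& params) const {
        // The real selector ranks candidates by ParamsKey support and
        // efficiency; this naive version returns every registered name.
        KernelsData kd;
        for (const auto& make : implementations_)
            kd.push_back(make(params));
        return kd;
    }

private:
    example_kernel_selector() {
        // Mirrors Attach<ScaleGradWeightsKernelRef>() in the deleted code.
        Attach([](const Params& p) { return "example_gpu_ref:" + p.layer_id; });
    }
    void Attach(std::function<std::string(const Params&)> impl) {
        implementations_.push_back(std::move(impl));
    }
    std::vector<std::function<std::string(const Params&)>> implementations_;
};

int main() {
    for (const auto& k : example_kernel_selector::Instance().GetBestKernels({"layer0"}))
        std::cout << k << '\n';
}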
-
-#include "softmax_loss_grad_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include <vector>
-
-namespace kernel_selector {
-JitConstants SoftmaxLossGradKernelBase::GetJitConstants(const softmax_loss_grad_params& params) const {
-    return MakeBaseParamsJitConstants(params);
-}
-
-CommonDispatchData SoftmaxLossGradKernelBase::SetDefault(const softmax_loss_grad_params& params,
-                                                         const optional_params&) const {
-    CommonDispatchData runInfo;
-
-    std::vector<size_t> global = {params.output.Batch().v * params.output.X().v, 1, 1};
-
-    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
-    runInfo.gws0 = global[0];
-    runInfo.gws1 = global[1];
-    runInfo.gws2 = global[2];
-
-    runInfo.lws0 = local[0];
-    runInfo.lws1 = local[1];
-    runInfo.lws2 = local[2];
-
-    runInfo.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
-    runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
-    return runInfo;
-}
-
-bool SoftmaxLossGradKernelBase::Validate(const Params& p, const optional_params& o) const {
-    if (p.GetType() != KernelType::SOFT_MAX_LOSS_GRAD || o.GetType() != KernelType::SOFT_MAX_LOSS_GRAD) {
-        return false;
-    }
-
-    return true;
-}
-
-KernelsData SoftmaxLossGradKernelBase::GetCommonKernelsData(const Params& params,
-                                                            const optional_params& options) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    const softmax_loss_grad_params& orgParams = static_cast<const softmax_loss_grad_params&>(params);
-    KernelData kd = KernelData::Default<softmax_loss_grad_params>(params);
-
-    auto runInfo = SetDefault(orgParams, options);
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
-
-    kd.estimatedTime = runInfo.efficiency;
-
-    return {kd};
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h
deleted file mode 100644
index e16128da90f..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
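[Editor's note] SetDefault() in the .cpp above sizes the dispatch the way most reference kernels in this tree do: one global work-item per (batch, x) output element, with the local size chosen by GetOptimalLocalWorkGroupSizes(). A standalone sketch of that sizing logic; the helper is approximated here by "largest divisor of the global size within a work-group budget", whereas the real one also consults device limits:

#include <array>
#include <cstddef>
#include <iostream>

// Rough stand-in for GetOptimalLocalWorkGroupSizes(): per axis, pick the
// largest divisor of the global size that keeps the work-group <= max_wg.
std::array<size_t, 3> pick_local(const std::array<size_t, 3>& global, size_t max_wg = 256) {
    std::array<size_t, 3> local{1, 1, 1};
    size_t budget = max_wg;
    for (size_t i = 0; i < 3; ++i) {
        size_t best = 1;
        for (size_t d = 1; d <= global[i] && d <= budget; ++d)
            if (global[i] % d == 0) best = d;
        local[i] = best;
        budget /= best;
    }
    return local;
}

int main() {
    // As in SetDefault() above: gws = { batch * X, 1, 1 } for a [batch, X] output.
    const size_t batch = 8, x = 100;
    std::array<size_t, 3> global{batch * x, 1, 1};
    auto local = pick_local(global);
    std::cout << "gws=" << global[0] << " lws=" << local[0] << '\n';  // lws always divides gws
}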
- -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SoftmaxLossGradParams -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct softmax_loss_grad_params : public base_params { - softmax_loss_grad_params() : base_params(KernelType::SOFT_MAX_LOSS_GRAD) {} - - virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SoftmaxLossGradOptionalParams -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct softmax_loss_grad_optional_params : optional_params { - softmax_loss_grad_optional_params() : optional_params(KernelType::SOFT_MAX_LOSS_GRAD) {} -}; - -class SoftmaxLossGradKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - virtual ~SoftmaxLossGradKernelBase() {} - -protected: - virtual bool Validate(const Params&, const optional_params&) const; - virtual JitConstants GetJitConstants(const softmax_loss_grad_params& params) const; - virtual CommonDispatchData SetDefault(const softmax_loss_grad_params& params, - const optional_params& optParams) const; - KernelsData GetCommonKernelsData(const Params& params, const optional_params& optParams) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp deleted file mode 100644 index db6af855c47..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
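[Editor's note] The OpenCL source for this primitive is not visible in this part of the patch, but the textbook operation behind a "softmax loss grad" is the gradient of cross-entropy taken through a softmax, dL/dz_i = p_i - [i == label]; presumably the removed reference kernel computed this from its two inputs (probabilities and labels). A host-side sketch of that formula, for orientation only:

#include <cstddef>
#include <iostream>
#include <vector>

// Gradient of cross-entropy loss through a softmax: given probabilities p
// (already softmax-ed) and the true class index, dL/dz_i = p_i - 1[i == label].
std::vector<float> softmax_loss_grad(const std::vector<float>& probs, size_t label) {
    std::vector<float> grad(probs);
    grad[label] -= 1.0f;
    return grad;
}

int main() {
    std::vector<float> p = {0.1f, 0.7f, 0.2f};   // softmax output for one sample
    for (float g : softmax_loss_grad(p, 1))      // true class = 1
        std::cout << g << ' ';                   // -> 0.1 -0.3 0.2
    std::cout << '\n';
}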
- -#include "softmax_loss_grad_kernel_ref.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { -ParamsKey SoftmaxLossGradKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::bf); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableBatching(); - k.EnableGradient(); - return k; -} - -KernelsData SoftmaxLossGradKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h deleted file mode 100644 index 52f9b66a5be..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "softmax_loss_grad_kernel_base.h" - -namespace kernel_selector { -class SoftmaxLossGradKernelRef : public SoftmaxLossGradKernelBase { -public: - using Parent = SoftmaxLossGradKernelBase; - SoftmaxLossGradKernelRef() : Parent("softmax_loss_grad_gpu_ref") {} - virtual ~SoftmaxLossGradKernelRef() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp deleted file mode 100644 index e4c1a71344b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "softmax_loss_grad_kernel_selector.h"
-#include "softmax_loss_grad_kernel_ref.h"
-
-namespace kernel_selector {
-
-softmax_loss_grad_kernel_selector::softmax_loss_grad_kernel_selector() { Attach<SoftmaxLossGradKernelRef>(); }
-
-KernelsData softmax_loss_grad_kernel_selector::GetBestKernels(const Params& params,
                                                              const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::SOFT_MAX_LOSS_GRAD);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h
deleted file mode 100644
index 03e00d1edd3..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class softmax_loss_grad_kernel_selector : public kernel_selector_base {
-public:
-    static softmax_loss_grad_kernel_selector& Instance() {
-        static softmax_loss_grad_kernel_selector instance_;
-        return instance_;
-    }
-
-    softmax_loss_grad_kernel_selector();
-
-    virtual ~softmax_loss_grad_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp
index 78c15188611..b5b0acfdaa9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
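[Editor's note] Besides the deletions, the patch touches strided_slice_kernel_ref.cpp: the hunk below drops a stray semicolon after the closing brace of makeJitConstForParam(). A semicolon after a function definition is an empty declaration; it compiles, but trips -Wextra-semi/-pedantic style warnings, hence the cleanup. Minimal before/after illustration:

// Before: function body terminated with "};" - the extra ';' is an empty
// declaration at namespace scope and is flagged by -Wextra-semi.
static void before() {
};

// After: plain closing brace, as the hunk below rewrites it.
static void after() {
}

int main() { before(); after(); }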
@@ -33,7 +33,7 @@ static void makeJitConstForParam(JitConstants& jit, const std::string name, cons jit.AddConstant(MakeJitConstant(name + "_Y", vec[2])); jit.AddConstant(MakeJitConstant(name + "_X", vec[3])); } -}; +} static size_t GetUsedOutDimsCount(const strided_slice_params& params) { auto dims = params.output.GetDims(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl deleted file mode 100644 index aaf60c3ce14..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/include_all.cl" - -#define LOCAL_SIZE INPUT0_BATCH_NUM - -__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) -KERNEL(batch_norm_gpu)( - const __global UNIT_TYPE* input, - #ifdef MEAN_VAR_OUT - __global UNIT_TYPE* mean_out, - __global UNIT_TYPE* variance_out, - #endif - #ifdef SCALE_SHIFT - __global UNIT_TYPE* scale, - __global UNIT_TYPE* shift, - #endif - #ifdef FORWARD - __global UNIT_TYPE* inv_var, - #endif - __global UNIT_TYPE* output) -{ - __local ACCUMULATOR_TYPE sum[LOCAL_SIZE]; - - const uint local_idx = (uint)get_global_id(0); - const uint f = (uint)get_global_id(1); - - sum[local_idx] = 0; - - uint input_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0); - for (uint y = 0; y < INPUT0_SIZE_Y; y++) - { - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - UNIT_TYPE in = input[input_idx]; - sum[local_idx] += in; - input_idx += INPUT0_X_PITCH; - } - input_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - UNIT_TYPE mean = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); -#ifdef MEAN_VAR_OUT - mean_out[f] = mean; -#endif - sum[local_idx] = 0; - - input_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0); - for (uint y = 0; y < INPUT0_SIZE_Y; y++) - { - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - UNIT_TYPE in = input[input_idx] - mean; - sum[local_idx] += in * in; - input_idx += INPUT0_X_PITCH; - } - input_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float variance = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); -#ifdef MEAN_VAR_OUT - variance_out[f] = variance; -#endif - float inv_variance = (float)(1.0 / sqrt(variance + EPSILON)); -#ifdef FORWARD - if (local_idx == 0) - inv_var[f] = inv_variance; -#endif - - uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < 
OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { - #ifdef SCALE_SHIFT - output[out_idx] = (inv_variance * (input[out_idx] - mean)) * scale[f] + shift[f]; - #else - output[out_idx] = inv_variance * (input[out_idx] - mean); - #endif - out_idx += OUTPUT_X_PITCH; - } - out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } -} - -#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl deleted file mode 100644 index 0c698de0f91..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/include_all.cl" - -#define LOCAL_SIZE INPUT0_BATCH_NUM - -__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) -KERNEL(batch_norm_grad_gpu)(const __global UNIT_TYPE* input_grad, __global UNIT_TYPE* input, __global UNIT_TYPE* inv_var, __global UNIT_TYPE* output) -{ - __local ACCUMULATOR_TYPE grad_sum[LOCAL_SIZE]; - __local ACCUMULATOR_TYPE grad_sum_in[LOCAL_SIZE]; - - const uint local_idx = (uint)get_local_id(0); - const uint f = (uint)get_global_id(1); - - grad_sum[local_idx] = 0; - grad_sum_in[local_idx] = 0; - - uint grad_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0); - for (uint y = 0; y < INPUT0_SIZE_Y; y++) - { - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - UNIT_TYPE in_g = input_grad[grad_idx]; - grad_sum[local_idx] += in_g; - grad_sum_in[local_idx] += in_g * input[grad_idx]; - grad_idx += INPUT0_X_PITCH; - } - grad_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - grad_sum[local_idx] += grad_sum[local_idx + offset]; - grad_sum_in[local_idx] += grad_sum_in[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - UNIT_TYPE grad_mean = grad_sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - UNIT_TYPE grad_mean_in = grad_sum_in[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - - uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { - UNIT_TYPE grad_out = inv_var[f] * (input_grad[out_idx] - grad_mean - grad_mean_in * input[out_idx]); - - if (grad_out > 5.0f) - grad_out = 5.0f; - else if (grad_out < -5.0f) - grad_out = -5.0f; - - output[out_idx] = grad_out; - out_idx += OUTPUT_X_PITCH; - } - out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - -} - -#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl deleted file mode 100644 index 
b15787539d7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - - -KERNEL(contract_ref)( - const __global INPUT0_TYPE* input, - __global INPUT0_TYPE* output) -{ - INPUT0_TYPE out_val = REDUCE_SEED; - -#if REDUCE_B - for (uint in_b = 0; in_b < INPUT0_BATCH_NUM; ++in_b) { -#else - const uint in_b = (uint) get_global_id(DIM_B); -#endif - -#if REDUCE_F - for (uint in_f = 0; in_f < INPUT0_FEATURE_NUM; ++in_f) { -#else - const uint in_f = (uint) get_global_id(DIM_F); -#endif - -#if REDUCE_Y - for (uint in_y = 0; in_y < INPUT0_SIZE_Y; ++in_y) { -#else - const uint in_y = (uint) get_global_id(DIM_Y); -#endif - -#if REDUCE_X - for (uint in_x = 0; in_x < INPUT0_SIZE_X; ++in_x) { -#else - const uint in_x = (uint) get_global_id(DIM_X); -#endif - - out_val = REDUCE_OPERATION(out_val, input[GET_DATA_INDEX(INPUT0, in_b, in_f, in_y, in_x)]); - -#if REDUCE_X - } -#endif -#if REDUCE_Y - } -#endif -#if REDUCE_F - } -#endif -#if REDUCE_B - } -#endif - - output[GET_DATA_INDEX(OUTPUT, 0, get_global_id(0), get_global_id(1), get_global_id(2))] = out_val; -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl deleted file mode 100644 index 0cb1f5fc289..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
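[Editor's note] contract_ref.cl above reduces a bfyx tensor over whichever of the four axes the REDUCE_B/F/Y/X flags select, seeding the accumulator with REDUCE_SEED and folding with REDUCE_OPERATION (both chosen at JIT time). The same control flow in plain C++, with the compile-time flags turned into runtime booleans:

#include <cstddef>
#include <iostream>
#include <vector>

// Reduce a dense [B, F, Y, X] tensor over the axes whose flag is set,
// mirroring contract_ref.cl: non-reduced axes index the output, reduced
// axes become inner loops folded with `op` starting from `seed`.
template <typename Op>
std::vector<float> contract(const std::vector<float>& in,
                            size_t B, size_t F, size_t Y, size_t X,
                            bool rB, bool rF, bool rY, bool rX,
                            float seed, Op op) {
    const size_t oB = rB ? 1 : B, oF = rF ? 1 : F, oY = rY ? 1 : Y, oX = rX ? 1 : X;
    std::vector<float> out(oB * oF * oY * oX, seed);
    for (size_t b = 0; b < B; ++b)
        for (size_t f = 0; f < F; ++f)
            for (size_t y = 0; y < Y; ++y)
                for (size_t x = 0; x < X; ++x) {
                    const size_t ob = rB ? 0 : b, of = rF ? 0 : f,
                                 oy = rY ? 0 : y, ox = rX ? 0 : x;
                    float& acc = out[((ob * oF + of) * oY + oy) * oX + ox];
                    acc = op(acc, in[((b * F + f) * Y + y) * X + x]);
                }
    return out;
}

int main() {
    std::vector<float> t(2 * 3 * 1 * 1);
    for (size_t i = 0; i < t.size(); ++i) t[i] = float(i);
    // Sum over batch (REDUCE_B): output shape [1, 3, 1, 1].
    auto r = contract(t, 2, 3, 1, 1, true, false, false, false,
                      0.0f, [](float a, float b) { return a + b; });
    for (float v : r) std::cout << v << ' ';  // -> 3 5 7
    std::cout << '\n';
}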
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_1x1_gemm_MMAD)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif - uint split_idx) -{ - const uint sg_channel = get_sub_group_local_id(); - - const uint x = ((uint)get_group_id(0) * 8) % INPUT0_SIZE_X; - const uint y = ((uint)get_group_id(0) * 8) / INPUT0_SIZE_X; - const uint f = (uint)get_global_id(1) % FILTER_OFM_ALIGNED; - const uint b = (uint)get_global_id(1) / FILTER_OFM_ALIGNED; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; - - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset; - uint in_addr = input_offset + input_x * INPUT0_X_PITCH + input_y * INPUT0_Y_PITCH; - - const uint filter_offset = ((uint)get_group_id(1) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint filter_idx = filter_offset; - - int8 tileA; - int8 tileB; - int8 tileC; - for(uint i = 0; i < 8; i++) - { - tileC[i] = 0; - } - - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - // load A tile ( input ) - for(uint i = 0; i < 8; i++) - { - uint tmp_addr = in_addr + i * INPUT0_X_PITCH; - tileA[i] = as_int(intel_sub_group_block_read((const __global uint*)(input + tmp_addr))); - } - - // load B tile ( weights ) - tileB = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - // compute C tile ( output ) - tileC = MMAD_8x8(tileA, tileB, tileC); - - in_addr += 32; // 4 features per channel * 8 SIMD channels - filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel - } - -#if BIAS_TERM -#if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x); -#elif BIAS_PER_OFM - const uint bias_index = f; -#endif - for(uint i = 0; i < 8; i++) - { -#if CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]); -#else // CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * O_QF); -#endif // CALIBRATION_TERM - } -#endif // BIAS_TERM - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - // save to output - for(uint i = 0; i < 8; i++) - { - const uint curr_x = (x + i) % INPUT0_SIZE_X; - const uint curr_y = y + (x + i) / INPUT0_SIZE_X; - if(curr_x < INPUT0_SIZE_X && curr_y < INPUT0_SIZE_Y) - { - const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, curr_y, curr_x) + out_split_offset; - output[dst_index] = ACTIVATION(convert_char(tileC[i]), ACTIVATION_PARAMS); - } - } -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index 1f9424253a3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_SLICES ((FILTER_IFM_NUM + 3) /4) -#define FILTER_SIZE_X_SLICES ((FILTER_SIZE_X + 7) / 8) - -#define FILTER_IFM_SLICE_PITCH (32 * (FILTER_SIZE_X_SLICES * 8) * FILTER_SIZE_Y) -#define FILTER_OFM_SLICE_PITCH (FILTER_IFM_SLICE_PITCH * FILTER_IFM_SLICES) - -#define OUT_BLOCK_BATCH 2 -#define OUT_BLOCK_HEIGHT 2 -#define WEIGHTS_PER_WORKITEM 4 // currently needs to be set to 4, check output stage and float4 on quantizations etc. - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - out[w + pb * 4] = convert_uchar_sat((float)dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb][i] * SCALE + bias_f[w]); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - out[w + pb * 4] = convert_uchar_sat(dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb][i]); - -#else - -#define QUANTIZATION \ - out[w + pb * 4] = as_uchar( ACTIVATION( convert_char( round( ( (float)dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w])), ACTIVATION_PARAMS)); - -#endif - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution_gpu_byx8_f4_fs_bs_yx_bsv4_fsv32)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, - __global BIAS_TYPE* biases, - __global float* quantizations, -#if CALIBRATION_TERM - __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = get_group_id(1) * 8; - const uint y = get_group_id(2) * OUT_BLOCK_HEIGHT; - - const uint bf_id = ((uint)get_group_id(0) * WG_BATCH_SIZE + (uint)get_sub_group_id()) * 8 * WEIGHTS_PER_WORKITEM; - - const uint f = (bf_id) % OUTPUT_FEATURE_NUM; - const uint b = OUT_BLOCK_BATCH * (bf_id / OUTPUT_FEATURE_NUM); - - int8 dotProd[OUT_BLOCK_BATCH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - uint filter_offset = (f/8)*FILTER_OFM_SLICE_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET; - - for (uint k = 0; k < FILTER_IFM_SLICES; ++k) - { - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j * DILATION_SIZE_Y; - - __attribute__((opencl_unroll_hint(FILTER_SIZE_X_SLICES))) - for(uint i = 0; i < FILTER_SIZE_X_SLICES; i++) - { - const uint filter_spatial_offset = 32 * (i*8 + (FILTER_SIZE_X_SLICES * 8) * 
j); - - int8 act_reg[OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH]; // activations for MMAD - - // preload batch data - __attribute__((opencl_unroll_hint(OUT_BLOCK_BATCH))) - for(uint pb = 0; pb < OUT_BLOCK_BATCH; pb++) - { - // preload spatial data - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) - { - uint input_idx = GET_DATA_BYX8_F4_INDEX(INPUT0, b + pb, k * 4, input_offset_y + h * STRIDE_SIZE_Y, input_x + i * 8); - int2 _input_data_01 = as_int2(intel_sub_group_block_read2((__global uint*)(input + input_idx))); - int _input_data_2 = as_int(intel_sub_group_block_read((__global uint*)(input + input_idx + 8 * 8))); - - act_reg[h * OUT_BLOCK_BATCH + pb][0] = _input_data_01[0]; - act_reg[h * OUT_BLOCK_BATCH + pb][1] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 1); - act_reg[h * OUT_BLOCK_BATCH + pb][2] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 2); - act_reg[h * OUT_BLOCK_BATCH + pb][3] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 3); - act_reg[h * OUT_BLOCK_BATCH + pb][4] = _input_data_01[1]; - act_reg[h * OUT_BLOCK_BATCH + pb][5] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 1); - act_reg[h * OUT_BLOCK_BATCH + pb][6] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 2); - act_reg[h * OUT_BLOCK_BATCH + pb][7] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 3); - } - } - - uint filter_idx = filter_offset + filter_spatial_offset; - - // preload weights - int8 _weights[WEIGHTS_PER_WORKITEM]; - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) // iterate over output feature channels for weights - { - _weights[w] = as_int8(intel_sub_group_block_read8((__global uint*)(weights + filter_idx))); - filter_idx += FILTER_OFM_SLICE_PITCH; - } - - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) // iterate over output feature channels for weights - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_BATCH))) - for(uint pb = 0; pb < OUT_BLOCK_BATCH; pb++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) - { - // MMAD on 8x WEIGHTS_PER_WORKITEM input channels elements for 8x outputs in WI - dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb] = MMAD_8x8(act_reg[h * OUT_BLOCK_BATCH + pb], _weights[w], dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb]); - } - } - } - } - } - filter_offset += FILTER_IFM_SLICE_PITCH; - } - - -const uint sg_local_f = get_sub_group_local_id() * 4; -float4 quant_f = vload4(0, quantizations + f + sg_local_f); -float4 bias_f = vload4(0, biases + f + sg_local_f); -float4 calib_f = vload4(0, calibrations + f + sg_local_f); - -__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) -for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) -{ - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y + h, x); - - __attribute__((opencl_unroll_hint(8))) - for(uint i = 0; i < 8; i++) - { - - #if WEIGHTS_PER_WORKITEM == 4 - - uchar8 out; - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint pb = 0; pb < OUT_BLOCK_BATCH; pb++) - { - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - QUANTIZATION; - } - } - intel_sub_group_block_write2((__global unsigned int*)(output + dst_index + 32 * 4 * i), as_uint2(out)); - - 
#else - #error NOT IMPLEMENTED - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - #if CALIBRATION_TERM - dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w]); - #else // CALIBRATION_TERM - dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * O_QF); - #endif // CALIBRATION_TERM - output[dst_index + 32 * 4 * i + 8 * w] = ACTIVATION(convert_char(dotProd[w*OUT_BLOCK_HEIGHT + h][i]), ACTIVATION_PARAMS); - } - - #endif - } -} - -} - -#undef OUT_BLOCK_HEIGHT -#undef WEIGHTS_PER_WORKITEM - -#undef FILTER_SIZE_X_SLICES -#undef FILTER_IFM_SLICES - -#undef FILTER_IFM_SLICE_PITCH -#undef FILTER_OFM_SLICE_PITCH - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index 0da4d28d15f..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
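[Editor's note] The int8 convolution kernels removed in this patch share one epilogue, visible in the QUANTIZATION macros of the byx8_f4 kernel above and the mmad_32x32sg kernels below: the int32 accumulator is rescaled as round((acc * quant * I_QF + bias) * calib), pushed through the activation, then saturated to 8 bits. A scalar C++ equivalent of that epilogue; I_QF is the input quantization factor baked in at JIT time, and ReLU is assumed for ACTIVATION in this example:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Per-output-feature requantization used by the deleted MMAD int8 kernels:
// int32 accumulator -> float rescale -> activation -> saturating int8.
int8_t requantize(int32_t acc, float quant, float i_qf, float bias, float calib) {
    float v = (static_cast<float>(acc) * quant * i_qf + bias) * calib;
    v = std::round(v);
    v = std::max(v, 0.0f);  // assuming a ReLU ACTIVATION for the example
    return static_cast<int8_t>(std::clamp(v, -128.0f, 127.0f));
}

int main() {
    // Hypothetical per-feature constants (quantizations[f], biases[f], calibrations[f]).
    std::cout << int(requantize(/*acc=*/1234, /*quant=*/0.05f, /*i_qf=*/1.0f,
                                /*bias=*/3.0f, /*calib=*/0.5f)) << '\n';  // -> 32
}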
- -#include "include/include_all.cl" - -#define OBS 8 -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif - uint split_idx) -{ - const uint f_pack = ((uint)get_group_id(0) * 32) % OUTPUT_FEATURE_NUM; - const uint b = ((uint)get_group_id(0) * 32) / OUTPUT_FEATURE_NUM; - - const uint x = (uint)get_group_id(1) * OBS; - const uint y = get_group_id(2); - - int4 dotProd[OBS] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = f_pack*FILTER_OFM_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET; - - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j; - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - const int input_offset_x = input_x + i + STRIDE_SIZE_X * get_sub_group_local_id(); - uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH; - uint filter_idx = filter_offset + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; - - char input_data[3]; - char2 _i = vload2(0, input + input_idx); - input_data[0] = _i.s0; - input_data[1] = _i.s1; - input_data[2] = input[input_idx + 2]; - - for (uint k = 0; k < FILTER_IFM_NUM; ++k) - { - char4 w_data = as_char4(intel_sub_group_block_read((const __global uint*)(weights + filter_idx))); - for(uint r = 0; r < OBS; r++) - { - char in = intel_sub_group_shuffle(input_data[k], r); - for(uint c = 0; c < 4; c++) - { - dotProd[r][c] += (int)in * (int)w_data[c]; - } - } - filter_idx += FILTER_IFM_PITCH; - } - } - } - - -const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f_pack, y, x + get_sub_group_local_id()); -const uint _f_idx = f_pack + get_sub_group_local_id() * 4; -float4 quants = vload4(0, quantizations + _f_idx ); -float4 calibs = vload4(0, calibrations + _f_idx ); -float4 bias = vload4(0, biases + _f_idx ); -for(uint r = 0; r < OBS; r++) -{ - char4 char_output; - for(uint c = 0; c < 4; c++) - { - const uint f_idx = f_pack + get_sub_group_local_id() * 4 + c; - #if BIAS_TERM - const uint bias_index = f_idx; - #if CALIBRATION_TERM - dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * calibs[c]); - #else // CALIBRATION_TERM - dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * O_QF); - #endif // CALIBRATION_TERM - #endif - char_output[c] = ACTIVATION(convert_char(dotProd[r][c]), ACTIVATION_PARAMS); - } - const uint out_idx = intel_sub_group_shuffle(dst_index, r); - intel_sub_group_block_write( (__global uint*)(output + out_idx) , as_uint(char_output)); -} - -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl deleted file mode 100644 index 61ed1de16f9..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl +++ /dev/null @@ -1,396 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - for(uint z = 0; z < 4; z++)\ - {\ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + bias_f.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + bias_f.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + bias_f.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + bias_f.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = convert_uchar_sat(regC[0 * 4 + i][idx]);\ - regC_uchar16.s1 = convert_uchar_sat(regC[1 * 4 + i][idx]);\ - regC_uchar16.s2 = convert_uchar_sat(regC[2 * 4 + i][idx]);\ - regC_uchar16.s3 = convert_uchar_sat(regC[3 * 4 + i][idx]);\ - \ - regC_uchar16.s4 = convert_uchar_sat(regC[0 * 4 + i][idx+1]);\ - regC_uchar16.s5 = convert_uchar_sat(regC[1 * 4 + i][idx+1]);\ - regC_uchar16.s6 = convert_uchar_sat(regC[2 * 4 + i][idx+1]);\ - regC_uchar16.s7 = convert_uchar_sat(regC[3 * 4 + i][idx+1]);\ - \ - regC_uchar16.s8 = convert_uchar_sat(regC[0 * 4 + i][idx+2]);\ - regC_uchar16.s9 = convert_uchar_sat(regC[1 * 4 + i][idx+2]);\ - regC_uchar16.sa = convert_uchar_sat(regC[2 * 4 + i][idx+2]);\ - regC_uchar16.sb = convert_uchar_sat(regC[3 * 4 + i][idx+2]);\ - \ - regC_uchar16.sc = convert_uchar_sat(regC[0 * 4 + i][idx+3]);\ - regC_uchar16.sd = convert_uchar_sat(regC[1 * 4 + i][idx+3]);\ - regC_uchar16.se = convert_uchar_sat(regC[2 * 4 + i][idx+3]);\ - regC_uchar16.sf = convert_uchar_sat(regC[3 * 4 + i][idx+3]); - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF 
+ bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - -#endif - - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - 
__attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8) - ( - __global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx - - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // Each work-group works to compute 128x128 tile. - // Each work-group contains 16 sub-groups. - // Each sub-group within the work-group works to compute a 32x32 tile. - // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). - // 2) Each sub-group works to compute 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. - // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") - __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 - __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 - - __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; - __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; - __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; - - const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y); - - const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); - const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); - const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); - const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); - - // Thread IDs - const uint g_tidY = get_global_id(DIM_Y); // 0,...,all_wi_inY - const uint g_tidX = get_global_id(DIM_X); // 0,...,all_wi_inX - const uint l_tidX = get_local_id(DIM_X); // 0,...,31 in WG - const uint l_tidY = get_local_id(DIM_Y); // 0,1,2,3 in WG - const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; // 0,1,2,...127 - - // SubGroup IDs - const uint sg_tid = get_sub_group_local_id(); // 0,1,...,8 - const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); //{0}/8 - const uint sg_global_idY = g_tidY; //{0} - - const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); // {0,...,31}/8={0,0,0,0,0...,1,1,1,...,3,3,3} - const uint sg_local_idY = l_tidY; // 0,1,2,3 - const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX; // get_local_size(DIM_X) / SG_SIZE = 32/8 = 4 - - const uint sub_group_id = get_sub_group_id(); - - - // Registers - int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts // (32/8)*4 - int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA - int8 colB[2]; // each lane will store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * 
sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices - uint g_idxA[2]; - uint g_idxB[2]; -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; - g_idxA[1] = g_idxA[0] + l_groupSize; - g_idxB[1] = g_idxB[0] + l_groupSize; -#else // Row (matrixA) and Col (matrixB) major layout - g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - /* - * SLM setup - HDC read only - */ - // Overlap HDC reads with mmad compute - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - /* - * mmad compute - */ - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - /* - * SLM setup - SLM write only - */ - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * 
l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - /* - * Last mmad compute iteration (avoids branching in main loop) - */ - - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); - float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif - -} - -#undef QUANTIZATION -#undef SCALE diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl deleted file mode 100644 index 6fccacc0ac5..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - for(uint z = 0; z < 4; z++)\ - {\ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + bias_f.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + bias_f.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + bias_f.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + bias_f.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = convert_uchar_sat(regC[0 * 4 + i][idx]);\ - regC_uchar16.s1 = convert_uchar_sat(regC[1 * 4 + i][idx]);\ - regC_uchar16.s2 = convert_uchar_sat(regC[2 * 4 + i][idx]);\ - regC_uchar16.s3 = convert_uchar_sat(regC[3 * 4 + i][idx]);\ - \ - regC_uchar16.s4 = convert_uchar_sat(regC[0 * 4 + i][idx+1]);\ - regC_uchar16.s5 = convert_uchar_sat(regC[1 * 4 + i][idx+1]);\ - regC_uchar16.s6 = convert_uchar_sat(regC[2 * 4 + i][idx+1]);\ - regC_uchar16.s7 = convert_uchar_sat(regC[3 * 4 + i][idx+1]);\ - \ - regC_uchar16.s8 = convert_uchar_sat(regC[0 * 4 + i][idx+2]);\ - regC_uchar16.s9 = convert_uchar_sat(regC[1 * 4 + i][idx+2]);\ - regC_uchar16.sa = convert_uchar_sat(regC[2 * 4 + i][idx+2]);\ - regC_uchar16.sb = convert_uchar_sat(regC[3 * 4 + i][idx+2]);\ - \ - regC_uchar16.sc = convert_uchar_sat(regC[0 * 4 + i][idx+3]);\ - regC_uchar16.sd = convert_uchar_sat(regC[1 * 4 + i][idx+3]);\ - regC_uchar16.se = convert_uchar_sat(regC[2 * 4 + i][idx+3]);\ - regC_uchar16.sf = convert_uchar_sat(regC[3 * 4 + i][idx+3]); - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF 
+ bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - -#endif - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - 
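For reference, the four unrolled loops in mmad_32x32_int8 build one 32x32 accumulator tile out of a 4x4 grid of MMAD_8x8 calls, alternating colB[0]/colB[1] so two registers cover the four 32x8 slices of tileB. Each int operand packs four int8 values along K, so an 8-int register spans K = 32. Ignoring how rows are distributed across sub-group lanes, each MMAD_8x8 step behaves like an int8 dot-product accumulation; a minimal scalar model in C, with an assumed operand layout (not the device intrinsic), is:

#include <stdint.h>

/* Scalar model of one MMAD_8x8 step: an 8x8 tile of 32-bit
 * accumulators absorbs an 8x32 int8 slice of A and a 32x8 int8
 * slice of B. Layout is assumed; the real intrinsic operates on
 * packed ints across sub-group lanes. */
void mmad_8x8_ref(const int8_t a[8][32], const int8_t b[32][8],
                  int32_t acc[8][8])
{
    for (int m = 0; m < 8; ++m)
        for (int n = 0; n < 8; ++n)
            for (int k = 0; k < 32; ++k)
                acc[m][n] += (int32_t)a[m][k] * (int32_t)b[k][n];
}

Swapping colB between block reads lets the kernel keep only two column registers live while still covering all four column groups of regC.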
__attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_224x128WG_SLM_INT8) - (__global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx - - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // Each work-group works to compute 128x128 tile. - // Each work-group contains 16 sub-groups. - // Each sub-group within the work-group works to compute a 32x32 tile. - // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). - // 2) Each sub-group works to compute 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. - // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") - __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; - __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; - - __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; - __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; - __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; - - const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y); - - const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); - const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); - const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); - const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); - - // Thread IDs - const uint g_tidY = get_global_id(DIM_Y); - const uint g_tidX = get_global_id(DIM_X); - const uint l_tidX = get_local_id(DIM_X); - const uint l_tidY = get_local_id(DIM_Y); - const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; - - // SubGroup IDs - const uint sg_tid = get_sub_group_local_id(); - const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); - const uint sg_global_idY = g_tidY; - const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); - const uint sg_local_idY = l_tidY; - const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX; - - const uint sub_group_id = get_sub_group_id(); - - // Registers - int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts - int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA - int8 colB[2]; // each lane will store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const 
uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices - uint g_idxA[2]; - uint g_idxB[2]; -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; - g_idxA[1] = g_idxA[0] + l_groupSize; - g_idxB[1] = g_idxB[0] + l_groupSize; -#else // Row (matrixA) and Col (matrixB) major layout - g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - - //MMAD compute - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - //SLM setup - SLM write only - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - if (l_tid < 32) - { - // Not all work-items will be 
needed to fetch the remaining matrix B - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - //Last MMAD compute iteration (avoids branching in main loop) - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); - float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } - -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl deleted file mode 100644 index ddc43c1f789..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -} - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8) - ( - __global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx - - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // 1) All work-items in work-group fill SLM with tileA and tileB. - // 2) Each sub-group works to compute a 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. 
- // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") - __local int8 l_workGroupTileA_0[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; - __local int8 l_workGroupTileB_0[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; - __local uint* l_workGroupTileA_uint_0 = (__local uint*)l_workGroupTileA_0; - - __local int8 l_workGroupTileA_1[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; - __local int8 l_workGroupTileB_1[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; - __local uint* l_workGroupTileA_uint_1 = (__local uint*)l_workGroupTileA_1; - - __local int8* l_workGroupTileA_live = l_workGroupTileA_0; - __local int8* l_workGroupTileB_live = l_workGroupTileB_0; - __local uint* l_workGroupTileA_live_uint = l_workGroupTileA_uint_0; - - __local int4* l_workGroupTileA_0_int4 = (__local int4*)l_workGroupTileA_0; - __local int4* l_workGroupTileB_0_int4 = (__local int4*)l_workGroupTileB_0; - __local int4* l_workGroupTileA_1_int4 = (__local int4*)l_workGroupTileA_1; - __local int4* l_workGroupTileB_1_int4 = (__local int4*)l_workGroupTileB_1; - - const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y); - - // Thread IDs - const uint g_tidY = get_global_id(DIM_Y); - const uint g_tidX = get_global_id(DIM_X); - const uint l_tidX = get_local_id(DIM_X); - const uint l_tidY = get_local_id(DIM_Y); - const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; - - // SubGroup IDs - const uint sg_tid = get_sub_group_local_id(); - const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); - const uint sg_global_idY = g_tidY; - const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); - const uint sg_local_idY = l_tidY; - const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX; - - const uint sub_group_id = get_sub_group_id(); - - // Registers - int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts - int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA - int8 colB[2]; // each lane will store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - uint g_idxA = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - uint g_idxB = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; -#else // Row (matrixA) and Col (matrixB) major layout - uint g_idxA = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - uint g_idxB = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); -#endif - - // Initial SLM setup - { - uint g_idxATemp = g_idxA; - for (uint i = l_tid; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE) - { - l_workGroupTileA_0_int4[i] 
= g_matrixA[g_idxATemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxATemp += WG_SIZE; -#else - g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - - uint g_idxBTemp = g_idxB; - for (uint i = l_tid; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE) - { - l_workGroupTileB_0_int4[i] = g_matrixB[g_idxBTemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxBTemp += WG_SIZE; -#else - g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA += MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int4 hdcReadValueA[(WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1 - ? 1 - : (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE]; - int4 hdcReadValueB[(WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1 - ? 1 - : (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - /* - * SLM setup - HDC read only - */ - -#if ((MATRIX_K / MATRIX_SMALL_K) > 1) - uint g_idxATemp = g_idxA; - for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j) - { - hdcReadValueA[j] = g_matrixA[g_idxATemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxATemp += WG_SIZE; -#else - g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - - uint g_idxBTemp = g_idxB; - for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j) - { - hdcReadValueB[j] = g_matrixB[g_idxBTemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxBTemp += WG_SIZE; -#else - g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA += MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_SMALL_K / sizeof(int4); -#endif -#endif - - /* - * MMAD compute - */ - - FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live, - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - /* - * SLM setup - SLM write only - */ - -#if ((MATRIX_K / MATRIX_SMALL_K) > 1) - if (k % 2 == 0) - { - for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileA_1_int4[i] = hdcReadValueA[j]; - } - - for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileB_1_int4[i] = hdcReadValueB[j]; - } - - l_workGroupTileA_live = l_workGroupTileA_1; - l_workGroupTileB_live = l_workGroupTileB_1; - l_workGroupTileA_live_uint = l_workGroupTileA_uint_1; - } - else - { - for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileA_0_int4[i] = hdcReadValueA[j]; - } - - for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileB_0_int4[i] = hdcReadValueB[j]; - } - - l_workGroupTileA_live = l_workGroupTileA_0; - l_workGroupTileB_live = l_workGroupTileB_0; - l_workGroupTileA_live_uint = l_workGroupTileA_uint_0; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - } - - /* - * Last MMAD compute iteration (avoids branching in main loop) - */ - FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, 
l_offsetTileA, l_workGroupTileB_live, - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar8 regC_uchar8[SIMD_LANE_M * SIMD_LANE_N / (sizeof(uchar8) / sizeof(uchar))]; - uint offset_uc8 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id() + feature_off; - - float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) )); - float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) )); - float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) )); - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - // begin of account for output PADDING - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - // end of account for padding - - // B0 F0..31 - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - // B1 F0..31 - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - padded_offset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - - // B2 F0..31 - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - 
regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - // B3 F0..31 - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - // - - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s4) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s4) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s4) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s4) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s5) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s5) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s5) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s5) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - padded_offset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s6) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s6) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s6) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s6) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s7) * quant_f.s0 * I_QF + bias_f.s0) 
* calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s7) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s7) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s7) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - } -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif - -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl index b41428dc4e3..d94dbbd0cdc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl @@ -29,7 +29,6 @@ #undef TO_ACCUMULATOR_TYPE #endif -#if QUANTIZATION_TERM #define ACCUMULATOR_TYPE int #define TO_ACCUMULATOR_TYPE(x) convert_int(x) #define ACTIVATION_TYPE float @@ -55,10 +54,6 @@ #error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported block size" #endif -#else // QUANTIZATION_TERM -#error "convolution_gpu_mmad_b_fs_yx_fsv32: invalid parameters: quantization term is expected to be true" -#endif - __attribute__((reqd_work_group_size(8, OW_GROUP, 1))) __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) KERNEL(convolution_mmad_b_fs_yx_fsv32)( diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl deleted file mode 100644 index 075a5b04c2d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
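Both the uchar8 path above and the uchar16 path in the sibling kernels apply the same per-channel requantization before the sub-group block write: scale the int32 accumulator by the channel's quantization factor and I_QF, add the bias, scale by the calibration factor, round, narrow to char, then apply ACTIVATION. A scalar C model of that step follows; the helper name is hypothetical, and note that the kernel's convert_char truncates to the low bits rather than saturating, so the clamp here is an assumption made to keep the model well-defined.

#include <math.h>
#include <stdint.h>

/* Reference for the per-channel requantization: acc is the int32
 * MMAD accumulator for one output element; quant/bias/calib are the
 * per-feature parameters loaded from the quantizations, biases and
 * calibrations buffers; i_qf mirrors the I_QF macro. */
int8_t requantize_ref(int32_t acc, float quant, float i_qf,
                      float bias, float calib)
{
    float v = ((float)acc * quant * i_qf + bias) * calib;
    long r = lroundf(v);
    if (r > 127)  r = 127;   /* assumed clamp; convert_char wraps */
    if (r < -128) r = -128;
    return (int8_t)r;        /* ACTIVATION(...) is applied after this */
}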
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) -// input data is in blocks 4batch x 32 features -// each SIMD process 4 batches and 8 output features - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_mmad_batched)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - const __global float* quantizations, -#endif -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = get_global_id(0); - const uint y = get_global_id(1); - - const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED; - const uint b_block = (uint)get_global_id(2) / FILTER_OFM_ALIGNED; - const uint f_block = f / 32; - - int4 dotProd = 0; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - const uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block; - - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j * DILATION_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - - if(!zero_y) - { - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - const int input_offset_x = input_x + i * DILATION_SIZE_X; - const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - - if(!zero_x) - { - uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH + k * IN_F_BLOCK_PITCH; - uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; - - int4 input_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); - int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - dotProd = MMAD_4x8(input_data, weights_data, dotProd); - } - } - } - } - } - -for(uint b = 0; b < 4; b++) -{ - -#if BIAS_TERM - const uint bias_index = f; -#if QUANTIZATION_TERM -#if CALIBRATION_TERM - dotProd[b] = (UNIT_TYPE)round(((float)dotProd[b] * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]); -#else // CALIBRATION_TERM - dotProd[b] = (UNIT_TYPE)round(((float)dotProd[b] * quantizations[f] * I_QF + biases[bias_index]) * O_QF); -#endif // CALIBRATION_TERM -#else // QUANTIZATION_TERM - dotProd[b] += (UNIT_TYPE)biases[bias_index]; -#endif // QUANTIZATION_TERM -#endif // BIAS_TERM - - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4 + b, f, y, x); -#if QUANTIZATION_TERM - output[dst_index] = ACTIVATION(convert_char(dotProd[b]), ACTIVATION_PARAMS); -#else - output[dst_index] = ACTIVATION(dotProd[b], ACTIVATION_PARAMS); -#endif -} -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl deleted file mode 100644 
index 583d1bea3f3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 0][b] * SCALE + bias_f.s0);\ - out[1] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 1][b] * SCALE + bias_f.s1);\ - out[2] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 2][b] * SCALE + bias_f.s2);\ - out[3] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 3][b] * SCALE + bias_f.s3); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 0][b]);\ - out[1] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 1][b]);\ - out[2] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 2][b]);\ - out[3] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 3][b]); - -#else - -#define QUANTIZATION \ - char4 out;\ - out[0] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 0][b] * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS);\ - out[1] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 1][b] * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS);\ - out[2] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 2][b] * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS);\ - out[3] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 3][b] * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS); - -#endif - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) -// input data is in blocks 4batch x 32 features - -#define NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_mmad_batched_block)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, - __global BIAS_TYPE* biases, - const __global float* quantizations, -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH; - const uint y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT; - - const uint b_f = ((uint)get_group_id(2) * WG_BATCH_COUNT + get_sub_group_id()); - -#if WEIGHTS_PER_WORKITEM == 4 - const uint f = (b_f * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED; -#else - const uint f = ((b_f * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED; -#endif - const uint b_block = (b_f * 8 * WEIGHTS_PER_WORKITEM) / 
FILTER_OFM_ALIGNED; - - // all accumulators - int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = ((b_f * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block + input_y * IN_Y_PITCH + input_x * IN_X_PITCH; - - uint filter_idx = filter_offset; - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - uint input_offset_y = 0; - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (uint j = 0; j < FILTER_SIZE_Y; ++j) - { - uint input_idx = input_offset + input_offset_y; - - ////// preloading input data ////// - int4 preloaded_input[NEEDED_INPUT_X]; - for(int p = 0; p < NEEDED_INPUT_X; p++) - { - preloaded_input[p] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); - input_idx += IN_X_PITCH; - } - - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint wi = 0; wi < WEIGHTS_PER_WORKITEM; wi++) - { - ////// preloading weights data ////// - int8 preloaded_weights[FILTER_SIZE_X]; - uint tmp_filter_idx = filter_idx; - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for(uint w = 0; w < FILTER_SIZE_X; w++) - { - preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + tmp_filter_idx + (wi * FILTER_OFM_BLOCK_PITCH)))); - tmp_filter_idx += FILTER_X_PITCH; - } - ////// computing ////// - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for (uint i = 0; i < FILTER_SIZE_X; ++i) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++) - { - const uint out_idx = ox + wi * OUT_BLOCK_WIDTH; - const uint in_idx = ox * STRIDE_SIZE_X + i; - dotProd[out_idx] = MMAD_4x8(preloaded_input[in_idx], preloaded_weights[i], dotProd[out_idx]); - } - } - } - filter_idx += FILTER_X_PITCH * FILTER_SIZE_X; - input_offset_y += IN_Y_PITCH; - } - input_offset += IN_F_BLOCK_PITCH; - } - -////// QUANTIZE & OUTPUT ////// - -#if WEIGHTS_PER_WORKITEM == 4 - -float4 quant_f = vload4(0, quantizations + f); -float4 bias_f = vload4(0, biases + f); -float4 calib_f = vload4(0, calibrations + f); - -uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y, x); - -__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) -for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) -{ - uint4 to_output; - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - QUANTIZATION; - to_output[b] = as_uint(out); - } - intel_sub_group_block_write4((__global uint*)(output + dst_index), to_output); - dst_index += OUT_X_PITCH; -} -#else -__attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) -for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) -{ - float quant_f = quantizations[f + w * 8]; - float bias_f = biases[f + w * 8]; -#if CALIBRATION_TERM - float calib_f = calibrations[f + w * 8]; -#endif - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * w; - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - #if CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f); - #else // CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF); - #endif // CALIBRATION_TERM - - 
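The fs_bs_yx_bsv4_fsv32 indexing used by these writes is the same layout that calculate_output_offset_to_account_padding decodes in the SLM kernels above: features are blocked by 32 and batches by 4, so a linear offset factors into (f_val, b_val, x, y, b_slice, f_slice) by successive mod/div before re-linearizing with the padded pitches. A small C sketch of that decode, with parameters standing in for the OUTPUT_* macros (batch assumed >= 4, features >= 32):

#include <stdint.h>

typedef struct { uint32_t f_val, b_val, x, y, b_slice, f_slice; } Coords;

/* Mirror of the mod/div chain in
 * calculate_output_offset_to_account_padding. */
Coords decode_bsv4_fsv32(uint32_t off, uint32_t size_x, uint32_t size_y,
                         uint32_t batch, uint32_t features)
{
    Coords c;
    c.f_val   = off % 32;      off /= 32;      /* feature within 32-block */
    c.b_val   = off % 4;       off /= 4;       /* batch within 4-block    */
    c.x       = off % size_x;  off /= size_x;
    c.y       = off % size_y;  off /= size_y;
    c.b_slice = off % (batch / 4);  off /= (batch / 4);
    c.f_slice = off % (features / 32);
    return c;
}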
const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4 + b, f + w * 8, y, x + o); - output[dst_index] = ACTIVATION(convert_char(dotProd[out_idx][b]), ACTIVATION_PARAMS); - } - } -} -#endif - -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl deleted file mode 100644 index 98b034bb714..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s0);\ - out[1] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * SCALE + bias_f.s1);\ - out[2] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * SCALE + bias_f.s2);\ - out[3] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s3); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]);\ - out[1] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]);\ - out[2] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]);\ - out[3] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]); - -#else - -#define QUANTIZATION \ - char4 out;\ - out[0] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0 ) ), ACTIVATION_PARAMS);\ - out[1] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1 ) ), ACTIVATION_PARAMS);\ - out[2] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2 ) ), ACTIVATION_PARAMS);\ - out[3] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b] * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3 ) ), ACTIVATION_PARAMS); - -#endif - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) -// input data is in blocks 4batch x 32 features - -#define 
NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1) -#define NEEDED_INPUT_Y ((OUT_BLOCK_HEIGHT-1) * (STRIDE_SIZE_Y) + (FILTER_SIZE_Y - 1) + 1) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_mmad_batched_block_1x1)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, - __global BIAS_TYPE* biases, - const __global float* quantizations, -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH; - const uint y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT; - - const uint b_f = ((uint)get_group_id(2) * WG_BATCH_COUNT + (uint)get_sub_group_id()); -#if WEIGHTS_PER_WORKITEM == 4 - const uint f = (b_f * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED; -#else - const uint f = ((b_f * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED; -#endif - const uint b_block = (b_f * 8 * WEIGHTS_PER_WORKITEM) / FILTER_OFM_ALIGNED; - - int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = ((b_f * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block; - - uint filter_idx = filter_offset; - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - ////// preloading input data ////// - int4 preloaded_input[NEEDED_INPUT_X * NEEDED_INPUT_Y]; - for(int h = 0; h < NEEDED_INPUT_Y; h++) - { - for(int p = 0; p < NEEDED_INPUT_X; p++) - { - const int input_offset_y = input_y + h; - const int input_offset_x = input_x + p; - - uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH; - preloaded_input[p + h * NEEDED_INPUT_X] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); - } - } - - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (uint j = 0; j < FILTER_SIZE_Y; ++j) - { - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for (uint i = 0; i < FILTER_SIZE_X; ++i) - { - ////// preloading weights data ////// - int8 preloaded_weights[WEIGHTS_PER_WORKITEM]; - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + (filter_idx + w * FILTER_OFM_BLOCK_PITCH) ) )); - } - - ////// computing ////// - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint oy = 0; oy < OUT_BLOCK_HEIGHT; oy++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++) - { - const uint out_idx = ox + OUT_BLOCK_WIDTH * (oy + w * OUT_BLOCK_HEIGHT); - const uint preloaded_idx =ox * STRIDE_SIZE_X + i + NEEDED_INPUT_X * (oy * STRIDE_SIZE_Y + j); - dotProd[out_idx] = MMAD_4x8(preloaded_input[preloaded_idx], preloaded_weights[w], dotProd[out_idx]); - } - } - } - filter_idx += FILTER_X_PITCH; - } - } - input_offset += IN_F_BLOCK_PITCH; - } - - -#if WEIGHTS_PER_WORKITEM == 4 - -float4 quant_f = vload4(0, quantizations + f); -float4 bias_f = vload4(0, biases + f); -float4 calib_f = vload4(0, calibrations + f); - -uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y, x); - 
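The NEEDED_INPUT_X/Y macros above size the preloaded register windows: OUT_BLOCK strided outputs under a FILTER-wide window touch ((OUT_BLOCK-1)*STRIDE + (FILTER-1) + 1) input positions, since the last output's last tap lands at (OUT_BLOCK-1)*STRIDE + (FILTER-1). A trivial C check of the formula, with illustrative values:

#include <assert.h>

/* Footprint of OUT_BLOCK strided outputs under a FILTER-wide window;
 * matches the NEEDED_INPUT_X/Y macros from the deleted kernels. */
int needed_input(int out_block, int stride, int filter)
{
    return (out_block - 1) * stride + (filter - 1) + 1;
}

int main(void)
{
    assert(needed_input(14, 1, 3) == 16); /* 14 outputs, stride 1, 3-tap */
    assert(needed_input(2, 2, 3) == 5);   /*  2 outputs, stride 2, 3-tap */
    return 0;
}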
-__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) -for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) -{ - uint tmp_dst_index = dst_index; - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - uint4 to_output; - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * h; - - QUANTIZATION; - to_output[b] = as_uint(out); - } - intel_sub_group_block_write4((__global uint*)(output + tmp_dst_index), to_output); - tmp_dst_index += OUT_X_PITCH; - } - dst_index += OUT_Y_PITCH; -} - -#else // WEIGHTS_PER_WORKITEM ==4 - -////// QUANTIZE & OUTPUT ////// -__attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) -for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) -{ - float quant_f = quantizations[f + w * 8]; - float bias_f = biases[f + w * 8]; -#if CALIBRATION_TERM - float calib_f = calibrations[f + w * 8]; -#endif - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT); - for(uint b = 0; b < 4; b++) - { - #if CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f); - #else // CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF); - #endif // CALIBRATION_TERM - } - } - } -} - -////// OUTPUT STAGE ////// -__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) -for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) -{ - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o); - - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - #if WEIGHTS_PER_WORKITEM == 2 - char2 out; - const uint out_idx = o + OUT_BLOCK_WIDTH * h; - out[0] = ACTIVATION(convert_char(dotProd[out_idx][b]), ACTIVATION_PARAMS); - out[1] = ACTIVATION(convert_char(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT][b]), ACTIVATION_PARAMS); - - intel_sub_group_block_write_uc2((__global uchar*)(output + dst_index + b * 32), as_uchar2(out)); - #else - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT); - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f + w * 8, y + h, x + o); - char char_val = ACTIVATION(convert_char(dotProd[out_idx][b]), ACTIVATION_PARAMS); - output[dst_index + b * 32] = char_val; - } - #endif - } - } -} - -#endif // WEIGHTS_PER_WORKITEM ==4 - -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED - - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl index 9f212434dc5..a26ca07ce42 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl @@ -33,7 +33,6 @@ #undef TO_ACCUMULATOR_TYPE #endif -#if 
QUANTIZATION_TERM #define ACCUMULATOR_TYPE int #define TO_ACCUMULATOR_TYPE(x) convert_int(x) #define ACTIVATION_TYPE float @@ -65,10 +64,6 @@ #error "convolution_gpu_mmad_bfyx_b_fs_yx_fsv32: Unsupported block size" #endif -#else // QUANTIZATION_TERM -#error "convolution_gpu_mmad_bfyx_b_fs_yx_fsv32: invalid parameters: quantization term is expected to be true" -#endif - #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) #define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl index 18d2e9b8859..cfa4f7ceaae 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl @@ -30,7 +30,6 @@ #undef TO_ACCUMULATOR_TYPE #endif -#if QUANTIZATION_TERM #define ACCUMULATOR_TYPE int #define TO_ACCUMULATOR_TYPE(x) convert_int(x) #define ACTIVATION_TYPE float @@ -54,10 +53,6 @@ #error "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4: Unsupported block size" #endif -#else // QUANTIZATION_TERM -#error "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4: invalid parameters: quantization term is expected to be true" -#endif - #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) #define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl deleted file mode 100644 index 9d84fd8bbc0..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl +++ /dev/null @@ -1,945 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
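Both hunks above leave the integer accumulator path unconditional: int8 activations and weights accumulate into an int32, and only the final scaling runs in float. A self-contained C sketch of that pattern; the names are illustrative and not taken from these kernels:

    #include <stdint.h>

    /* int8 x int8 dot product with int32 accumulation and a float rescale at
       the end - the pattern behind ACCUMULATOR_TYPE int / ACTIVATION_TYPE float. */
    static float dot_int8(const int8_t *act, const int8_t *wei, int n, float scale)
    {
        int32_t acc = 0;                              /* ACCUMULATOR_TYPE */
        for (int i = 0; i < n; ++i)
            acc += (int32_t)act[i] * (int32_t)wei[i];
        return (float)acc * scale;                    /* ACTIVATION_TYPE */
    }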
- -#include "include/data_types.cl" -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - slm_write0.s0 = convert_uchar_sat((float)outvec.s0 * SCALE + bias_f);\ - slm_write0.s1 = convert_uchar_sat((float)outvec.s1 * SCALE + bias_f);\ - slm_write0.s2 = convert_uchar_sat((float)outvec.s2 * SCALE + bias_f);\ - slm_write0.s3 = convert_uchar_sat((float)outvec.s3 * SCALE + bias_f); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - slm_write0.s0 = convert_uchar_sat(outvec.s0);\ - slm_write0.s1 = convert_uchar_sat(outvec.s1);\ - slm_write0.s2 = convert_uchar_sat(outvec.s2);\ - slm_write0.s3 = convert_uchar_sat(outvec.s3); - -#else - -#define QUANTIZATION \ - slm_write0.s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s0) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS));\ - slm_write0.s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s1) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS));\ - slm_write0.s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s2) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS));\ - slm_write0.s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s3) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS)); - -#endif - -// mapping to clDNN -#define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C) -#define _OD OUTPUT_FEATURE_NUM -#define _OW OUTPUT_SIZE_X -#define _OH OUTPUT_SIZE_Y -#define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) -#define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) -#define _IH INPUT0_SIZE_Y -#define _IW INPUT0_SIZE_X -#define _ID INPUT0_FEATURE_NUM -#define K_HEIGHT FILTER_SIZE_Y -#define K_WIDTH FILTER_SIZE_X -#define BATCH_SIZE OUTPUT_BATCH_NUM - -#define IHPAD (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) -#define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) -#define K_STRIDE STRIDE_SIZE_X -// end of mapping - -// for now kernel stride is square -#define K_WSTRIDE K_STRIDE -#define K_HSTRIDE K_STRIDE - -#define PACK 32 -#define BATCH_PACK 4 - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution_mmad_slm_2x14_rep4)( -__global int8 *inputs, -__global uchar* outputs, -__global int8* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - const __global float* quantizations, -#endif -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx -) -{ - const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z; - const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y; - - ushort fmg = get_group_id(0); // Output Depth - ushort group_y = get_group_id(1); // Output Width - ushort group_z = get_group_id(2); // Output Height - - /* 32,1,4 WG , SIMD8 - 16 HW threads in a WG - threads 0-3 (group1) : (lid_x:0-15,lid_y:0,lid_z:0) - threads 4-7 (group2) : (lid_x:0-15,lid_y:0,lid_z:1) - threads 8-11 (group3) : (lid_x:0-15,lid_y:0,lid_z:2) - threads 12-15 (group4) : (lid_x:0-15,lid_y:0,lid_z:3) - - Verify the sub_group layout with the printfs below: - - if(group_z == 0 && group_y == 0 && fmg == 0 && get_sub_group_id() == 31) { - printf("\n sub_group_local_id: %d, lid_x: %d, lid_y: %d, lid_z: %d ", get_sub_group_local_id(), get_local_id(0) ,get_local_id(1),get_local_id(2)); - printf("\n #WorkgroupsX: %d, #WorkgroupsY: %d, #WorkgroupsZ: %d",get_num_groups(0),get_num_groups(1),get_num_groups(2)); - } - - If the sub_group layout is different, derive lid_x and lid_z accordingly - - lid_z: thread_id/4 - */ - - /* Thread, local IDs */ - ushort thread_id = 
get_sub_group_id(); - ushort threadid_group_4 = thread_id % 4; - ushort threadid_mod_2 = thread_id % 2; - ushort threadid_mod_8 = thread_id % 8; - - ushort lid_x = get_local_id(0); - ushort lid_z = get_local_id(2); - - uchar lane_id = get_sub_group_local_id(); - - /* 32-bit signed accumulator for 4 mini-batches; each thread uses OUT_BLOCK_WIDTH*HEIGHT*4 registers - Will be converted to 8 bits before the final write */ - - int4 out[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = { 0 }; - - /* Account for batching */ - - ushort batch = ( fmg*LOCAL_SIZE_X ) / _OD; - - // Size calculated for int8 elements; one batch is processed as [H][W][4N][32C] - uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ; - - uint in_addr_offset = batch*input_size; - - /* Goto activation tile for work group, offset is w.r.t int8 array */ - - uint groupy_tile = TILE_W*group_y; - uint groupz_tile = TILE_H*group_z; - - in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK; - - /* SLM space for Activation, Weights - ( 32,1,4 ) Workgroup - 4 tiles along Y direction and 32 different output channels - Activation - 10Hx16Wx4Nx32C Weights - 9RSx32Kx32C */ - - __local int8 act_slm [ 10*16*4 ]; - __local int8 weight_slm [ 9*32 ]; - - /* 10Hx16Wx4Nx32C activation tile written into SLM. Distribute among 16 threads in Workgroup - threads 0-1 write 16x4x32 of H=0, W=0...15 ( 8x4x32 per thread ) - threads 2-3 write 16x4x32 of H=1, W=0...15 ( 8x4x32 per thread ) - threads 4-5 write 16x4x32 of H=2, W=0...15 ( 8x4x32 per thread ) - threads 6-7 write 16x4x32 of H=3, W=0...15 ( 8x4x32 per thread ) - threads 8-9 write 16x4x32 of H=4, W=0...15 ( 8x4x32 per thread ) - threads 10-11 write 16x4x32 of H=5, W=0...15 ( 8x4x32 per thread ) - thread 12 writes 16x4x32 of H=6, W=0...15 ( 16x4x32 per thread ) - thread 13 writes 16x4x32 of H=7 - thread 14 writes 16x4x32 of H=8 - thread 15 writes 16x4x32 of H=9 - - Interleaved write to avoid SLM bank conflicts - - threads 0,1 write 16x4x32 together - thread0 writes the first 4x32 block, thread1 writes the next 4x32 block, etc. 
- */ - - - /* Goto activation tile for thread in group */ - - uint row_offset = thread_id / 2; - - if ( thread_id >= 12 ) { - row_offset = 6 + thread_id - 12 - threadid_mod_2; - } - - // In addr offset for the particular thread - in_addr_offset += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ; - - /* Activation SLM indices */ - uint act_slm_write = row_offset * ( TILE_W + 2) * BATCH_PACK; - uint act_slm_read = OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ; - - /* Weights - Weight Global Tensor Order: [K/8][C/32][R][S][8C][8K][4C] - */ - - /* 9RSx32Kx32C Weight Block in SLM - thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0 ( k=0..7) - thread1 handles w(0,0),w(0,1),w(0,2) of K=1 ( k=8..15) - thread2 handles w(1,0),w(1,1) of K=0 ( k=0..7) - thread3 handles w(1,0),w(1,1) of K=1 ( k=8..15) - thread4 handles w(1,2),w(2,0) of K=0 ( k=0..7) - thread5 handles w(1,2),w(2,0) of K=1 ( k=8..15) - thread6 handles w(2,1),w(2,2) of K=0 ( k=0..7) - thread7 handles w(2,1),w(2,2) of K=1 ( k=8..15) - - Similarly threads8-15 handles for K=2,3 - - Weight Layout in SLM - - w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=8..15,C=0..15) - w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=8..15,C=16..31) - - Above interleaving present to avoid SLM Bank conflicts when fused threads read from SLM - Thread0 will read k=0..7, thread1 will read k=8..15 - - First all output channels are present in SLM, then next weight pixel is present in SLM */ - - #define NUM_FILTERS (K_HEIGHT * K_WIDTH) - - uint output_depth = fmg % ( _OD / LOCAL_SIZE_X ); - - uint weight_size_CRS = ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside - - // Global weight addr for workgroup - uint weight_global_addr_offset = output_depth * 4 * weight_size_CRS ; //32 output channels per workgroup - - // Global weight address for thread - uint weight_global_channel_offset = threadid_mod_2 * weight_size_CRS ; - - uint slm_channel_offset = 0; - - if ( thread_id >= 8 ) { - weight_global_channel_offset += 2*weight_size_CRS; - slm_channel_offset = 1; - } - - uint weight_global_pixel_offset = 0; - uint slm_pixel_offset = 0; - - if ( threadid_mod_8 >=2 ) - { - weight_global_pixel_offset = 3*8 + ( ( (threadid_mod_8/2) - 1 )*2*8 ); - slm_pixel_offset = 3*LOCAL_SIZE_X + ( ( (threadid_mod_8/2) - 1 )*2*LOCAL_SIZE_X ); - } - - weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset; - - /* Weight slm write index */ - - uint slm_write_weight = threadid_mod_2*4 + slm_pixel_offset + slm_channel_offset * 16; - - /* Weight slm read index */ - - uint wt_slm_rd_offset = threadid_group_4*8; - - if ( threadid_mod_2 ) - { - wt_slm_rd_offset = wt_slm_rd_offset - 8 + 4; - } - - int kd; - - __attribute__((opencl_unroll_hint(1))) - for(kd = 0; kd < ( _ID / PACK ) ; kd++) - { - - { - /* Load Activation from global to SLM */ - - int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset; - - __global uint *activation_tile = (__global uint*)&inputs[ in_addr ]; - - __local uint *act_slm_ptr = (__local uint *) &act_slm [ act_slm_write ]; - - /* The odd thread in fused pair will start from next 4x8 block */ - - activation_tile += threadid_mod_2*4*8; - act_slm_ptr += threadid_mod_2*4*8; - - int4 act_col_0 = as_int4( intel_sub_group_block_read4(activation_tile) ); - int4 act_col_1 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); - int4 act_col_2 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); - int4 act_col_3 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); 
- int4 act_col_4 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - int4 act_col_5 = as_int4( intel_sub_group_block_read4(activation_tile + 5*8*8) ); - int4 act_col_6 = as_int4( intel_sub_group_block_read4(activation_tile + 6*8*8) ); - int4 act_col_7 = as_int4( intel_sub_group_block_read4(activation_tile + 7*8*8) ); - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 5*8*8 ) , as_uint4 ( act_col_5 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 6*8*8 ) , as_uint4 ( act_col_6 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 7*8*8 ) , as_uint4 ( act_col_7 ) ); - - if ( thread_id >=12 ) - { - activation_tile = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8; - act_slm_ptr += 8*8*8; - - int4 act_col_9 = as_int4( intel_sub_group_block_read4(activation_tile) ); - int4 act_col_10 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); - int4 act_col_11 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); - int4 act_col_12 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); - int4 act_col_13 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - int4 act_col_14 = as_int4( intel_sub_group_block_read4(activation_tile + 5*8*8) ); - int4 act_col_15 = as_int4( intel_sub_group_block_read4(activation_tile + 6*8*8) ); - int4 act_col_16 = as_int4( intel_sub_group_block_read4(activation_tile + 7*8*8) ); - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_9 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_10 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 5*8*8 ) , as_uint4 ( act_col_14 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 6*8*8 ) , as_uint4 ( act_col_15 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 7*8*8 ) , as_uint4 ( act_col_16 ) ); - } - - /* load weights from global to weight_slm */ - - int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset; - - __global uint *weight_tile = (__global uint*)&weights [ weight_addr ]; - __local uint *wt_slm_ptr = (__local uint *) &weight_slm [ slm_write_weight ]; - - int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); - int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); - int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); - int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) ); - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w0 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 32*8 ) , as_uint4 ( w2 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 32*8 + 8*8 ) , as_uint4 ( w3 ) ); - - if( threadid_mod_8 < 2 ) - { - weight_tile += 16*8; - wt_slm_ptr += 2*32*8; - - int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); - int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w5 ) ); - } - } - - // Synchronize SLM writes across workgroup - 
barrier(CLK_LOCAL_MEM_FENCE); - - uint wt_slm_rd = wt_slm_rd_offset; - - __local uint *slm_ptr0 = (__local uint *) &act_slm[ act_slm_read ]; - __local uint *slm_ptr1 = (__local uint *) &weight_slm[ wt_slm_rd ]; - - int8 weights_reg0, weights_reg1,weights_reg2; - - /********************************************************************************************************** - First phase - load first row of weights and for the first activation row - 1Hx8Wx4N inputs at a time - - Weights - 24 registers, Activations - 32 registers: Total 56 registers used for input data - ***********************************************************************************************************/ - { - int4 act_reg[ 8 ]; - - /* Load weights from SLM into registers */ - { - weights_reg0.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg0.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg1.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg1.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg2.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg2.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - } - - /* load first 1Hx8Wx4N inputs - Activation Broadcast will occur since it is same for fused threads */ - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 0; ic < 8; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = ic * BATCH_PACK * 8 ; - - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/ - - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[0], weights_reg0 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[1], weights_reg0 ); - out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[2], weights_reg0 ); - out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[3], weights_reg0 ); - out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[4], weights_reg0 ); - out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[5], weights_reg0 ); - out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[6], weights_reg0 ); - out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[7], weights_reg0 ); - - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[1], weights_reg1 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[2], weights_reg1 ); - out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[3], weights_reg1 ); - out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[4], weights_reg1 ); - out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[5], weights_reg1 ); - out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[6], weights_reg1 ); - out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[7], weights_reg1 ); - - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[2], weights_reg2 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[3], weights_reg2 ); - out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[4], weights_reg2 ); - out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[5], weights_reg2 ); - out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[6], weights_reg2 ); - out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[7], weights_reg2 ); - - /* load next 1Hx8Wx4N inputs */ - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 8; ic < 16; ic++) - { - uint slm_offset = ic * BATCH_PACK * 8; - - act_reg [ ic - 8 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset) ) ; - } - - /* Convolve */ - - out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[0], weights_reg2 ); - out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[1], weights_reg2 ); - out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[2], weights_reg2 ); - out[ 9 ] = 
_MMAD_4x8 ( out[ 9 ], act_reg[3], weights_reg2 ); - out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[4], weights_reg2 ); - out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[5], weights_reg2 ); - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[6], weights_reg2 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[7], weights_reg2 ); - - out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[0], weights_reg1 ); - out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[1], weights_reg1 ); - out[ 9 ] = _MMAD_4x8 ( out[ 9 ], act_reg[2], weights_reg1 ); - out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[3], weights_reg1 ); - out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[4], weights_reg1 ); - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[5], weights_reg1 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[6], weights_reg1 ); - - out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[0], weights_reg0 ); - out[ 9 ] = _MMAD_4x8 ( out [ 9 ], act_reg[1], weights_reg0 ); - out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[2], weights_reg0 ); - out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[3], weights_reg0 ); - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[4], weights_reg0 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[5], weights_reg0 ); - } - - /* Second , Third phase */ - { - int8 weights_reg3, weights_reg4,weights_reg5; - int4 act_reg_2[ 6 ]; - - /***************************************************************************************************************************************** - Second phase - load second row of weights, now both rows are in registers, for the second activation row - 1Hx6Wx4N inputs at a time - - Weights - 48 registers, Activations - 24 registers: Total 72 registers used for input data - ******************************************************************************************************************************************/ - - /* Load weights of row = 1 from SLM into registers */ - { - - weights_reg3.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg3.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg4.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg4.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg5.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg5.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - } - - /* load input row =1,col=0:1 1Hx2Wx8N */ - - uint slm_row_offset_2 = 1*(TILE_W + 2)*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_2) ) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_2 + BATCH_PACK*8) ) ; - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[0] , weights_reg0 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[0] , weights_reg3 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg_2[1] , weights_reg3 ); - out[ 15 ] = _MMAD_4x8 ( out[ 15 ], act_reg_2[1] , weights_reg0 ); - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[1], weights_reg1 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[1], weights_reg4 ); - - /* load input row =1,col=2:7,8:13,1Hx6Wx4N */ - - uint col = 2; - - __attribute__((opencl_unroll_hint(2))) - do { - - uint slm_offset = 1*(TILE_W + 2)*BATCH_PACK*8 + col*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + BATCH_PACK*8)) ; - act_reg_2 [ 2 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 2*BATCH_PACK*8)) ; - act_reg_2 [ 3 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 
3*BATCH_PACK*8) ) ; - act_reg_2 [ 4 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 4*BATCH_PACK*8) ) ; - act_reg_2 [ 5 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 5*BATCH_PACK*8) ) ; - - uint first_row_offset = col - 2; - uint second_row_offset = 14 + col - 2; - - out [ first_row_offset ] = _MMAD_4x8 ( out[ first_row_offset ] , act_reg_2[0] , weights_reg5 ); - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1] , act_reg_2[0], weights_reg4 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2] , act_reg_2[0], weights_reg3 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[1], weights_reg3 ); - - out [ second_row_offset ] = _MMAD_4x8 ( out[ second_row_offset ] , act_reg_2[0] , weights_reg2 ); - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1] , act_reg_2[0], weights_reg1 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2] , act_reg_2[0], weights_reg0 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[1], weights_reg0 ); - - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1 ], act_reg_2[1], weights_reg5 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2 ], act_reg_2[1], weights_reg4 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[2], weights_reg4 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4 ], act_reg_2[2], weights_reg3 ); - - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1 ], act_reg_2[1], weights_reg2 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2 ], act_reg_2[1], weights_reg1 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3 ], act_reg_2[2], weights_reg1 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4 ], act_reg_2[2], weights_reg0 ); - - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2], act_reg_2[2], weights_reg5 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3], act_reg_2[3], weights_reg5 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[3], weights_reg4 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[3], weights_reg3 ); - - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2], act_reg_2[2], weights_reg2 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[3], weights_reg2 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[3], weights_reg1 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[3], weights_reg0 ); - - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[4], weights_reg3 ); - out [ first_row_offset + 7 ] = _MMAD_4x8 ( out[ first_row_offset + 7], act_reg_2[5], weights_reg3 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[4], weights_reg4 ); - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[5], weights_reg4 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[4], weights_reg5 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[5], weights_reg5 ); - - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[4], weights_reg0 ); - out [ second_row_offset + 7 ] = 
_MMAD_4x8 ( out[ second_row_offset + 7], act_reg_2[5], weights_reg0 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[4], weights_reg1 ); - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[5], weights_reg1 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[4], weights_reg2 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[5], weights_reg2 ); - - col +=6; - - } while ( col < 14 ); - - /* load input row =1,col=14:15 1Hx2Wx4N */ - - uint slm_row_offset_3 = 1 * (TILE_W + 2) * BATCH_PACK * 8 + 14 * BATCH_PACK * 8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_3)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_3 + BATCH_PACK*8)) ; - - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[0], weights_reg4 ); - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[0], weights_reg1 ); - out[ 26 ] = _MMAD_4x8 ( out[ 26 ], act_reg_2[0], weights_reg2 ); - - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg_2[0], weights_reg5 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[1], weights_reg5 ); - - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[1], weights_reg2 ); - - /**************************************************************************************************************************************** - Third phase - load third row of weights, this replaces first weight row, for the third activation row read 1Hx6Wx4N inputs at a time - - Weights - 48 registers, Activations - 24 registers: Total 72 registers used for input data - *****************************************************************************************************************************************/ - - /* Load weights of row = 2 from SLM into registers - replaces row = 0 weights */ - { - weights_reg0.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg0.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg1.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg1.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg2.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg2.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - } - - uint slm_row_offset_4 = 2*(TILE_W + 2)*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_4)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_4 + BATCH_PACK*8)) ; - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[0] , weights_reg3 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[0] , weights_reg0 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg_2[1] , weights_reg0 ); - out[ 15 ] = _MMAD_4x8 ( out[ 15 ], act_reg_2[1] , weights_reg3 ); - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[1], weights_reg4 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[1], weights_reg1 ); - - /* load input row =2,col=2:7,8:13,1Hx6Wx4N */ - - uint col_2 = 2; - - __attribute__((opencl_unroll_hint(2))) - do { - - uint slm_offset = 2*(TILE_W + 2)*BATCH_PACK*8 + col_2*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + BATCH_PACK*8)) ; - act_reg_2 [ 2 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 2*BATCH_PACK*8)) ; - act_reg_2 [ 3 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 3*BATCH_PACK*8) ) ; - act_reg_2 [ 4 ] = 
as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 4*BATCH_PACK*8) ) ; - act_reg_2 [ 5 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 5*BATCH_PACK*8) ) ; - - uint first_row_offset = col_2 - 2; - uint second_row_offset = 14 + col_2 - 2; - - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1] , act_reg_2[0], weights_reg1 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2] , act_reg_2[0], weights_reg0 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[1], weights_reg0 ); - out [ first_row_offset ] = _MMAD_4x8 ( out[ first_row_offset ] , act_reg_2[0] , weights_reg2 ); - - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1] , act_reg_2[0], weights_reg4 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2] , act_reg_2[0], weights_reg3 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[1], weights_reg3 ); - out [ second_row_offset ] = _MMAD_4x8 ( out[ second_row_offset ] , act_reg_2[0] , weights_reg5 ); - - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1 ], act_reg_2[1], weights_reg2 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2 ], act_reg_2[1], weights_reg1 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[2], weights_reg1 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4 ], act_reg_2[2], weights_reg0 ); - - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1 ], act_reg_2[1], weights_reg5 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2 ], act_reg_2[1], weights_reg4 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3 ], act_reg_2[2], weights_reg4 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4 ], act_reg_2[2], weights_reg3 ); - - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[3], weights_reg0 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2], act_reg_2[2], weights_reg2 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3], act_reg_2[3], weights_reg2 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[3], weights_reg1 ); - - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[3], weights_reg3 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2], act_reg_2[2], weights_reg5 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[3], weights_reg5 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[3], weights_reg4 ); - - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[4], weights_reg0 ); - out [ first_row_offset + 7 ] = _MMAD_4x8 ( out[ first_row_offset + 7], act_reg_2[5], weights_reg0 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[4], weights_reg1 ); - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[5], weights_reg1 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[4], weights_reg2 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[5], weights_reg2 ); - - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[4], weights_reg3 ); - out [ second_row_offset + 7 ] = _MMAD_4x8 ( out[ second_row_offset + 
7], act_reg_2[5], weights_reg3 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[4], weights_reg4 ); - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[5], weights_reg4 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[4], weights_reg5 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[5], weights_reg5 ); - - col_2 +=6; - - } while ( col_2 < 14 ); - - /* load input row =2,col=14:15 1Hx2Wx4N */ - - uint slm_row_offset_5 = 2 * (TILE_W + 2) * BATCH_PACK * 8 + 14 * BATCH_PACK * 8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_5)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_5 + BATCH_PACK*8)) ; - - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[0], weights_reg1 ); - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[0], weights_reg4 ); - out[ 26 ] = _MMAD_4x8 ( out[ 26 ], act_reg_2[0], weights_reg5 ); - - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg_2[0], weights_reg2 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[1], weights_reg2 ); - - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[1], weights_reg5 ); - } - - /************************************************************************************************* - Fourth phase - discard middle weight row, for fourth activation row load 1Hx8Wx4N at a time - - Weights - 24 registers, Activations - 32 registers: Total 56 registers used for input data - **************************************************************************************************/ - { - int4 act_reg[ 8 ]; - - /* load first 1Hx8Wx4N inputs */ - - uint slm_row_offset_6 = 3 * (TILE_W + 2) * BATCH_PACK * 8 ; - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 0; ic < 8; ic++) - { - /* Load activations from SLM into registers */ - uint slm_offset = ic * BATCH_PACK * 8 + slm_row_offset_6; - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - uint phase_offset = 14; - - out[ phase_offset + 0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[0], weights_reg0 ); - out[ phase_offset + 1 ] = _MMAD_4x8 ( out[ phase_offset +1 ], act_reg[1], weights_reg0 ); - out[ phase_offset +2 ] = _MMAD_4x8 ( out[ phase_offset +2 ], act_reg[2], weights_reg0 ); - out[ phase_offset +3 ] = _MMAD_4x8 ( out[ phase_offset +3 ], act_reg[3], weights_reg0 ); - out[ phase_offset +4 ] = _MMAD_4x8 ( out[ phase_offset +4 ], act_reg[4], weights_reg0 ); - out[ phase_offset +5 ] = _MMAD_4x8 ( out[ phase_offset +5 ], act_reg[5], weights_reg0 ); - out[ phase_offset +6 ] = _MMAD_4x8 ( out[ phase_offset +6 ], act_reg[6], weights_reg0 ); - out[ phase_offset +7 ] = _MMAD_4x8 ( out[ phase_offset +7 ], act_reg[7], weights_reg0 ); - - out[ phase_offset +0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[1], weights_reg1 ); - out[ phase_offset +1 ] = _MMAD_4x8 ( out[ phase_offset +1 ], act_reg[2], weights_reg1 ); - out[ phase_offset +2 ] = _MMAD_4x8 ( out[ phase_offset +2 ], act_reg[3], weights_reg1 ); - out[ phase_offset +3 ] = _MMAD_4x8 ( out[ phase_offset +3 ], act_reg[4], weights_reg1 ); - out[ phase_offset +4 ] = _MMAD_4x8 ( out[ phase_offset +4 ], act_reg[5], weights_reg1 ); - out[ phase_offset +5 ] = _MMAD_4x8 ( out[ phase_offset +5 ], act_reg[6], weights_reg1 ); - out[ phase_offset +6 ] = _MMAD_4x8 ( out[ phase_offset +6 ], act_reg[7], weights_reg1 ); - - out[ phase_offset +0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[2], weights_reg2 ); - out[ phase_offset +1 ] = _MMAD_4x8 ( 
out[ phase_offset + 1 ], act_reg[3], weights_reg2 ); - out[ phase_offset + 2 ] = _MMAD_4x8 ( out[ phase_offset + 2 ], act_reg[4], weights_reg2 ); - out[ phase_offset + 3 ] = _MMAD_4x8 ( out[ phase_offset + 3 ], act_reg[5], weights_reg2 ); - out[ phase_offset + 4 ] = _MMAD_4x8 ( out[ phase_offset + 4 ], act_reg[6], weights_reg2 ); - out[ phase_offset + 5 ] = _MMAD_4x8 ( out[ phase_offset + 5 ], act_reg[7], weights_reg2 ); - - /* load next 1Hx8Wx4N inputs */ - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 8; ic < 16; ic++) - { - uint slm_offset = ic * BATCH_PACK * 8 + slm_row_offset_6; - act_reg [ ic - 8 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - out[ phase_offset + 6 ] = _MMAD_4x8 ( out[ phase_offset + 6 ], act_reg[0], weights_reg2 ); - out[ phase_offset + 7 ] = _MMAD_4x8 ( out[ phase_offset + 7 ], act_reg[1], weights_reg2 ); - out[ phase_offset + 8 ] = _MMAD_4x8 ( out[ phase_offset + 8 ], act_reg[2], weights_reg2 ); - out[ phase_offset + 9 ] = _MMAD_4x8 ( out[ phase_offset + 9 ], act_reg[3], weights_reg2 ); - out[ phase_offset + 10 ] = _MMAD_4x8 ( out[ phase_offset + 10 ], act_reg[4], weights_reg2 ); - out[ phase_offset + 11 ] = _MMAD_4x8 ( out[ phase_offset + 11 ], act_reg[5], weights_reg2 ); - out[ phase_offset + 12 ] = _MMAD_4x8 ( out[ phase_offset + 12 ], act_reg[6], weights_reg2 ); - out[ phase_offset + 13 ] = _MMAD_4x8 ( out[ phase_offset + 13 ], act_reg[7], weights_reg2 ); - - out[ phase_offset + 7 ] = _MMAD_4x8 ( out[ phase_offset + 7 ], act_reg[0], weights_reg1 ); - out[ phase_offset + 8 ] = _MMAD_4x8 ( out[ phase_offset + 8 ], act_reg[1], weights_reg1 ); - out[ phase_offset + 9 ] = _MMAD_4x8 ( out[ phase_offset + 9 ], act_reg[2], weights_reg1 ); - out[ phase_offset + 10 ] = _MMAD_4x8 ( out[ phase_offset + 10 ], act_reg[3], weights_reg1 ); - out[ phase_offset + 11 ] = _MMAD_4x8 ( out[ phase_offset + 11 ], act_reg[4], weights_reg1 ); - out[ phase_offset + 12 ] = _MMAD_4x8 ( out[ phase_offset + 12 ], act_reg[5], weights_reg1 ); - out[ phase_offset + 13 ] = _MMAD_4x8 ( out[ phase_offset + 13 ], act_reg[6], weights_reg1 ); - - out[ phase_offset + 8 ] = _MMAD_4x8 ( out[ phase_offset + 8 ], act_reg[0], weights_reg0 ); - out[ phase_offset + 9 ] = _MMAD_4x8 ( out[ phase_offset + 9 ], act_reg[1], weights_reg0 ); - out[ phase_offset + 10 ] = _MMAD_4x8 ( out[ phase_offset + 10 ], act_reg[2], weights_reg0 ); - out[ phase_offset + 11 ] = _MMAD_4x8 ( out[ phase_offset + 11 ], act_reg[3], weights_reg0 ); - out[ phase_offset + 12 ] = _MMAD_4x8 ( out[ phase_offset + 12 ], act_reg[4], weights_reg0 ); - out[ phase_offset + 13 ] = _MMAD_4x8 ( out[ phase_offset + 13 ], act_reg[5], weights_reg0 ); - } - - // To make sure all threads in the WG have finished computing before the next depth tile of activations and weights is loaded into SLM - barrier(CLK_LOCAL_MEM_FENCE); - - } //for kd - - /**************************************************************************************************************** - *******************************Output Write Stage**************************************************************** - ****************************************************************************************************************/ - - /* - Outputs will be passed through the activation function and quantized to 8 bits before writing - Output will be in the same format as the input [K/32][N/4][P][Q][4N][32K] - Writes are staged in SLM so that 32-bit writes can be done to Global memory - */ - - /******************* Write output to SLM *************************************/ - 
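The [K/32][N/4][P][Q][4N][32K] layout named above is the same blocking that the address arithmetic further below implements with row_size_bytes and slice_pack_size_bytes. As a worked example, a C sketch of the byte offset of one (k, n, p, q) element under that blocking; the helper and parameter names are illustrative, with N, P, Q taken as the padded batch/height/width extents:

    #include <stddef.h>

    /* Byte offset of element (k, n, p, q) in a [K/32][N/4][P][Q][4N][32K] int8 tensor. */
    static size_t blocked_offset(size_t k, size_t n, size_t p, size_t q,
                                 size_t N, size_t P, size_t Q)
    {
        size_t slice = k / 32, kk = k % 32;   /* 32 output channels per slice pack */
        size_t npack = n / 4,  nn = n % 4;    /* 4 batches packed together         */
        return (((slice * (N / 4) + npack) * P + p) * Q + q) * (4 * 32) + nn * 32 + kk;
    }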
- /* - Quantize and pack 4x1 byte from consecutive n-coordinates - Write uint32 from each lane to SLM; the entire thread will write 8 consecutive K-coordinates - Four threads will write 4x8xuint32 for 32 output channels and 4 batches - This will be repeated for the entire WG-tile - - Assume one SLM row as 32 uints ( 32 channels, four batches for each channel - 4NK ) - */ - - uint out_slm_write = lid_z * TILE_W * OUT_BLOCK_HEIGHT * 32 + threadid_group_4 * 8 + lane_id; - - __local uchar4* out_slm = (__local uchar4*) &act_slm; - __local uchar4* out_slm_2 = (__local uchar4*) &out_slm[ out_slm_write ]; - - /* Scale the accumulator down and do the ReLU before converting to 8 bits */ - - /* Real code might do this, but need to get the scale right or the convert to uchar saturates and then doesn't match CPU - float scale = (float)SCALE_FACTOR; - - uchar outchar = (uchar)max(((float)outint) * scale, 0.0f); */ - - const uint _feature = ((fmg * 32) % _OD) + (uint)get_local_id(0); - float quant_f = as_float(intel_sub_group_block_read((__global uint*) (quantizations + _feature) )); - float bias_f = as_float(intel_sub_group_block_read((__global uint*) (biases + _feature) )); - float calib_f = as_float(intel_sub_group_block_read((__global uint*) (calibrations + _feature) )); - - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for (int r = 0; r < OUT_BLOCK_HEIGHT; r++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for (int c = 0; c < OUT_BLOCK_WIDTH; c++) - { - int4 outvec = out[ r * OUT_BLOCK_WIDTH + c]; - - uchar4 slm_write0; - - int slm_addr = c * 32 + r * TILE_W * 32; - - /*TODO - Activation & Quantization code goes here - presently applying ReLU and taking lower 8-bits */ - - QUANTIZATION; - - out_slm_2[ slm_addr ] = slm_write0; - - } // out_block_width-for loop - - } // out_block_height-for loop - - // Wait till all threads in WG finish placing the output - barrier(CLK_LOCAL_MEM_FENCE); - - /******************* Read from SLM & Write to Global *************************************/ - - /* Each lane will read uint4 from SLM - 4K x 4N values. Swizzle them into 4N x 4K order - - SLM Read Distribution - 8Px14Qx4Nx32K output tile - - Threads 0-1 handle row 0, col 0-13, - Threads 2-3 handle row 1, col 0-13, - .. - Threads 14-15 handle row 7, col 0-13 */
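Each SLM read below yields a uint4 in 4K4N order: every uint holds the four batch bytes of one output channel. The global write wants the transpose, 4N4K, where every uint holds four channel bytes of one batch. A standalone C sketch of that byte transpose; the function name is illustrative:

    #include <stdint.h>

    /* in[k] holds bytes n0..n3 of channel k (4K4N);
       out[n] packs bytes k0..k3 of batch n (4N4K). */
    static void swizzle_4k4n_to_4n4k(const uint32_t in[4], uint32_t out[4])
    {
        for (int n = 0; n < 4; ++n) {
            uint32_t v = 0;
            for (int k = 0; k < 4; ++k)
                v |= ((in[k] >> (8 * n)) & 0xFFu) << (8 * k);
            out[n] = v;
        }
    }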
- - uint row_id = thread_id / 2; - uint col_id = ( thread_id % 2 )*7; - - uint out_slm_read = col_id * 32 + row_id * TILE_W * 32 + lane_id * 4; - - __local uint4 *out_slm3 = (__local uint4*) &out_slm[ out_slm_read ]; - - /* feature maps are an array of slicePacks; each H,W position within the slice pack contains 32 8-bit feature maps (channels) of 4 different batches */ - uint row_size_bytes = (_OW + OWPAD) * PACK * BATCH_PACK; - - /* slice_pack is a pack of 32 feature map tiles laid out as [OH][OW][4][32], stored within the full [K/32][N/4][OH][OW][4][32] output */ - uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD); - - /* Each fmg writes [OH][OW][4][32] */ - - uint output_depth_index = output_depth; - - uint batch_index = batch; - - uint slice_pack_addr_bytes = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + (groupz_tile + row_id ) * row_size_bytes + (groupy_tile + col_id ) * PACK * BATCH_PACK; - - __global uint* output_write = (__global uint *) &outputs [ slice_pack_addr_bytes ]; - - /* Each lane writes 4K values of 4 batches and 7 different columns */ - - /* 4K values of K=0..31 */ - - const uint mask_constant = 0xFF; - - __attribute__((opencl_unroll_hint(7))) - for ( int c=0; c<7; c++ ) - { - /* Get 4K4N values in uint4 - each uint containing 4N values of a K - swizzle the data and pack into another uint4 containing 4N4K values - each uint containing 4K values of a N. - Use block_writes for writing uint4 */ - - uint4 out_k4n4 = out_slm3 [ c*8 ]; - - //Pack 4K values of first n - uchar4 out_n0k4; - - out_n0k4.s0 = out_k4n4.s0 & mask_constant; - out_n0k4.s1 = out_k4n4.s1 & mask_constant; - out_n0k4.s2 = out_k4n4.s2 & mask_constant; - out_n0k4.s3 = out_k4n4.s3 & mask_constant; - - /* Assigning to uchar hence need to get the required bits to lower 8-bits*/ - - //Pack 4K values of second n - uchar4 out_n1k4; - - out_n1k4.s0 = (out_k4n4.s0 >> 8) & mask_constant; - out_n1k4.s1 = (out_k4n4.s1 >> 8) & mask_constant; - out_n1k4.s2 = (out_k4n4.s2 >> 8) & mask_constant; - out_n1k4.s3 = (out_k4n4.s3 >> 8) & mask_constant; - - //Pack 4K values of third n - uchar4 out_n2k4; - - out_n2k4.s0 = (out_k4n4.s0 >> 16) & mask_constant; - out_n2k4.s1 = (out_k4n4.s1 >> 16) & mask_constant; - out_n2k4.s2 = (out_k4n4.s2 >> 16) & mask_constant; - out_n2k4.s3 = (out_k4n4.s3 >> 16) & mask_constant; - - //Pack 4K values of fourth n - uchar4 out_n3k4; - - out_n3k4.s0 = (out_k4n4.s0 >> 24) & mask_constant; - out_n3k4.s1 = (out_k4n4.s1 >> 24) & mask_constant; - out_n3k4.s2 = (out_k4n4.s2 >> 24) & mask_constant; - out_n3k4.s3 = (out_k4n4.s3 >> 24) & mask_constant; - - uint4 out_n4k4; - - out_n4k4.s0 = as_uint ( out_n0k4 ); - out_n4k4.s1 = as_uint ( out_n1k4 ); - out_n4k4.s2 = as_uint ( out_n2k4 ); - out_n4k4.s3 = as_uint ( out_n3k4 ); - - intel_sub_group_block_write4 ( output_write , out_n4k4 ); - - output_write += 4*8; - } -} //end of kernel - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl deleted file mode 100644 index 6a11e01bdc2..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl +++ /dev/null @@ -1,1044 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the 
"License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - out_write_N2K4[0].s0 = convert_uchar_sat((float)outvec0.s0 * SCALE + bias_f.s0); /*K= lane_id,N=0*/ \ - out_write_N2K4[0].s1 = convert_uchar_sat((float)outvec1.s0 * SCALE + bias_f.s1); /*K= lane_id + 8,N=0*/\ - out_write_N2K4[0].s2 = convert_uchar_sat((float)outvec2.s0 * SCALE + bias_f.s2); /*K= lane_id + 16,N=0*/\ - out_write_N2K4[0].s3 = convert_uchar_sat((float)outvec3.s0 * SCALE + bias_f.s3); /*K= lane_id + 24,N=0*/\ - \ - out_write_N2K4[0].s4 = convert_uchar_sat((float)outvec0.s1 * SCALE + bias_f.s0); /*K= lane_id,N=1*/\ - out_write_N2K4[0].s5 = convert_uchar_sat((float)outvec1.s1 * SCALE + bias_f.s1); /*K= lane_id + 8,N=1*/\ - out_write_N2K4[0].s6 = convert_uchar_sat((float)outvec2.s1 * SCALE + bias_f.s2); /*K= lane_id + 16,N=1*/\ - out_write_N2K4[0].s7 = convert_uchar_sat((float)outvec3.s1 * SCALE + bias_f.s3); /*K= lane_id + 24,N=1*/\ - \ - out_write_N2K4[1].s0 = convert_uchar_sat((float)outvec0.s2 * SCALE + bias_f.s0); /*K= lane_id,N=2*/\ - out_write_N2K4[1].s1 = convert_uchar_sat((float)outvec1.s2 * SCALE + bias_f.s1); /*K= lane_id + 8,N=2*/\ - out_write_N2K4[1].s2 = convert_uchar_sat((float)outvec2.s2 * SCALE + bias_f.s2); /*K= lane_id + 16,N=2*/\ - out_write_N2K4[1].s3 = convert_uchar_sat((float)outvec3.s2 * SCALE + bias_f.s3); /*K= lane_id + 24,N=2*/\ - \ - out_write_N2K4[1].s4 = convert_uchar_sat((float)outvec0.s3 * SCALE + bias_f.s0); /*K= lane_id,N=3*/\ - out_write_N2K4[1].s5 = convert_uchar_sat((float)outvec1.s3 * SCALE + bias_f.s1); /*K= lane_id + 8,N=3*/\ - out_write_N2K4[1].s6 = convert_uchar_sat((float)outvec2.s3 * SCALE + bias_f.s2); /*K= lane_id + 16,N=3*/\ - out_write_N2K4[1].s7 = convert_uchar_sat((float)outvec3.s3 * SCALE + bias_f.s3); /*K= lane_id + 24,N=3*/ - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - out_write_N2K4[0].s0 = convert_uchar_sat(outvec0.s0); /*K= lane_id,N=0*/ \ - out_write_N2K4[0].s1 = convert_uchar_sat(outvec1.s0); /*K= lane_id + 8,N=0*/\ - out_write_N2K4[0].s2 = convert_uchar_sat(outvec2.s0); /*K= lane_id + 16,N=0*/\ - out_write_N2K4[0].s3 = convert_uchar_sat(outvec3.s0); /*K= lane_id + 24,N=0*/\ - \ - out_write_N2K4[0].s4 = convert_uchar_sat(outvec0.s1); /*K= lane_id,N=1*/\ - out_write_N2K4[0].s5 = convert_uchar_sat(outvec1.s1); /*K= lane_id + 8,N=1*/\ - out_write_N2K4[0].s6 = convert_uchar_sat(outvec2.s1); /*K= lane_id + 16,N=1*/\ - out_write_N2K4[0].s7 = convert_uchar_sat(outvec3.s1); /*K= lane_id + 24,N=1*/\ - \ - out_write_N2K4[1].s0 = convert_uchar_sat(outvec0.s2); /*K= lane_id,N=2*/\ - out_write_N2K4[1].s1 = convert_uchar_sat(outvec1.s2); /*K= lane_id + 8,N=2*/\ - out_write_N2K4[1].s2 = convert_uchar_sat(outvec2.s2); /*K= lane_id + 16,N=2*/\ - out_write_N2K4[1].s3 = convert_uchar_sat(outvec3.s2); /*K= lane_id + 24,N=2*/\ - \ - out_write_N2K4[1].s4 = convert_uchar_sat(outvec0.s3); /*K= lane_id,N=3*/\ - out_write_N2K4[1].s5 = convert_uchar_sat(outvec1.s3); /*K= lane_id + 8,N=3*/\ - out_write_N2K4[1].s6 = 
convert_uchar_sat(outvec2.s3); /*K= lane_id + 16,N=3*/\ - out_write_N2K4[1].s7 = convert_uchar_sat(outvec3.s3); /*K= lane_id + 24,N=3*/ - -#else - -#define QUANTIZATION \ - out_write_N2K4[0].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=0*/ \ - out_write_N2K4[0].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=0*/\ - out_write_N2K4[0].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=0*/\ - out_write_N2K4[0].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=0*/\ - \ - out_write_N2K4[0].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=1*/\ - out_write_N2K4[0].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=1*/\ - out_write_N2K4[0].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=1*/\ - out_write_N2K4[0].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=1*/\ - \ - out_write_N2K4[1].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=2*/\ - out_write_N2K4[1].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=2*/\ - out_write_N2K4[1].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=2*/\ - out_write_N2K4[1].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=2*/\ - \ - out_write_N2K4[1].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=3*/\ - out_write_N2K4[1].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=3*/\ - out_write_N2K4[1].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=3*/\ - out_write_N2K4[1].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=3*/ - -#endif - -// mapping to clDNN -#define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C) -#define _OD OUTPUT_FEATURE_NUM -#define _OW OUTPUT_SIZE_X -#define _OH OUTPUT_SIZE_Y -#define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) -#define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) -#define _IH INPUT0_SIZE_Y -#define _IW INPUT0_SIZE_X -#define _ID INPUT0_FEATURE_NUM -#define K_HEIGHT FILTER_SIZE_Y -#define K_WIDTH FILTER_SIZE_X -#define BATCH_SIZE OUTPUT_BATCH_NUM - -#define IHPAD 
(INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) -#define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) -#define K_STRIDE STRIDE_SIZE_X -// end of mapping - -// for now kernel stride is square -#define K_WSTRIDE K_STRIDE -#define K_HSTRIDE K_STRIDE - -#define PACK 32 -#define BATCH_PACK 4 - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution_mmad_slm_7x7_rep4)( -__global int8 *inputs, -__global uchar* outputs, -__global int8* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - const __global float* quantizations, -#endif -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx -) -{ - const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z; - const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y; - - ushort fmg = get_group_id(0); // Output Depth - ushort group_y = get_group_id(1); // Output Width - ushort group_z = get_group_id(2); // Output Height - - /* 16,1,8 WG , SIMD8 - 16 HW threads in a WG - threads 0-1 : ( lid_x:0-15,lid_y:0,lid_z:0) - threads 2-3 : ( lid_x:0-15,lid_y:0,lid_z:1) - .. - threads 12-13: ( lid_x:0-15, lid_y:0,lid_z:6) - threads 14-15: ( lid_x:0-15, lid_y:0,lid_z:7) - */ - - /* Thread, local IDs */ - ushort thread_id = get_sub_group_id(); - ushort threadid_mod_2 = thread_id % 2; - ushort threadid_mod_8 = thread_id % 8; - - ushort lid_x = get_local_id(0); - ushort lid_z = get_local_id(2); - - uchar lane_id = get_sub_group_local_id(); - - /* 32-bit signed accumulator, 112 output registers for a 1Px7Qx4Nx32K output tile - Will be converted to 8 bits before the final write */ - - int4 out_07 [ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 0-7 - int4 out_815[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 8-15 - int4 out_1623[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 16-23 - int4 out_2431[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 24-31 - - /* Account for batching */ - - ushort batch = ( fmg*LOCAL_SIZE_X*4 ) / _OD; // Each thread processes 32 output channels and each fmg processes 64 output channels; LOCAL_SIZE_X is only 16 - - // Size calculated for int8 elements - uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ; - - uint in_addr_offset = batch*input_size; - - /* Goto activation tile for work group, offset is w.r.t int8 array */ - - uint groupy_tile = TILE_W*group_y; - uint groupz_tile = TILE_H*group_z; - - in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK; - - /* SLM space for Activation, Weights - ( 16,1,8 ) Workgroup - 7 tiles along Y direction and 64 different output channels - 2 threads used to load global memory - Activation - 9Hx9Wx4Nx32C Weights - 3Rx3Sx64Kx32C */ - - __local int8 act_slm [ 9*9*4 ]; - __local int8 weight_slm [ 9*64 ]; - 
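As a sanity check on the two declarations above: an OpenCL int8 is 32 bytes, so the activation and weight staging buffers together occupy about 28 KB of SLM. The arithmetic as a small C program; the common 64 KB per-work-group SLM limit is an assumption about the target hardware, not something this kernel states:

    #include <stdio.h>

    int main(void)
    {
        const unsigned int8_bytes = 8 * 4;            /* OpenCL int8 = 8 x 32-bit  */
        unsigned act    = 9 * 9 * 4 * int8_bytes;     /* act_slm:    10368 bytes   */
        unsigned weight = 9 * 64   * int8_bytes;      /* weight_slm: 18432 bytes   */
        printf("SLM use: %u bytes\n", act + weight);  /* 28800 bytes, ~28 KB < 64 KB */
        return 0;
    }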
Distribute among 16 threads in Workgroup - threads 0-1 write 9x4x32 of H=0, W=0...8 - threads 2-3 write 9x4x32 of H=1, W=0...8 - threads 4-5 write 9x4x32 of H=2, W=0...8 - threads 6-7 write 9x4x32 of H=3, W=0...8 - threads 8-9 write 9x4x32 of H=4, W=0...8 - threads 10-11 write 9x4x32 of H=5,W=0...8 - threads 12-13 write 9x4x32 of H=6,W=0...8 - thread 14 writes 9x4x32 of H=7,W=0...8 - thread 15 writes 9x4x32 of H=8,W=0...8 */ - - /* Goto activation tile for thread in group */ - - uint row_offset = thread_id / 2; - - if ( thread_id >= 14 ) - { - row_offset = 7; - } - - // In addr offset for the particular thread - in_addr_offset += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ; - - /* Activation SLM indices */ - uint act_slm_write = row_offset * ( TILE_W + 2) * BATCH_PACK; - uint act_slm_read = OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ; - - /* 9RSx64Kx32C Weight Block in SLM - thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0,1 ( k=0..15 ) - thread1 handles w(0,0),w(0,1),w(0,2) of K=2,3 ( k=16..31) - thread2 handles w(1,0),w(1,1) of K=0,1 ( k=0..15) - thread3 handles w(1,0),w(1,1) of K=2,3 ( k=16..31) - thread4 handles w(1,2),w(2,0) of K=0,1 ( k=0..15) - thread5 handles w(1,2),w(2,0) of K=2,3 ( k=16..31) - thread6 handles w(2,1),w(2,2) of K=0,1 ( k=0..15) - thread7 handles w(2,1),w(2,2) of K=2,3 ( k=16..31) - - Similarly, threads 8-15 handle K=4,5,6,7 - - Weight Layout in SLM - - w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=32..39,C=0..15) - w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=32..39,C=16..31) - - The above interleaving is present to avoid SLM bank conflicts when fused threads read from SLM - Thread0 will read k=0..31, thread1 will read k=32..63 - - All output channels of one weight pixel are stored first, then the next weight pixel follows */ - - #define NUM_FILTERS (K_HEIGHT * K_WIDTH) - - uint output_depth = fmg % ( _OD / ( LOCAL_SIZE_X * 4 ) ); //LOCAL_SIZE_X=16, 64 output channels used - - uint weight_size_CRS = ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside - - // Global weight addr for workgroup - uint weight_global_addr_offset = output_depth * 8 * weight_size_CRS ; //64 output channels per workgroup - - /* Global weight address for thread */ - - // Goto appropriate output channel in weights - uint weight_global_channel_offset = threadid_mod_2 * 2 * weight_size_CRS ; - - uint slm_channel_offset = threadid_mod_2; - uint bc_fused_thread_offset = 0; - - if ( thread_id >= 8 ) - { - bc_fused_thread_offset = 1; - - weight_global_channel_offset = 4 * weight_size_CRS + slm_channel_offset * weight_size_CRS * 2 ; - } - - // Goto appropriate pixel in weights - - uint weight_global_pixel_offset = 0; - uint slm_pixel_offset = 0; - - if ( threadid_mod_8 >=2 ) - { - /* First three pixels handled by threads 0-1, then 2 pixels handled by two threads */ - - weight_global_pixel_offset = 3*8 + ( ( (threadid_mod_8/2) - 1 )*2*8 ); - slm_pixel_offset = 3*64 + ( ( (threadid_mod_8/2) - 1 )*2*64 ); - } - - weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset; - - /* Weight slm write index */ - - uint slm_write_weight = slm_pixel_offset + slm_channel_offset * 32 + bc_fused_thread_offset * 4; - - /* Weight slm read index */ - - /* Thread 0 reads output channels 0-15, thread 1 handles output channels 16-31, data present in interleaved - manner in SLM - Data layout in SLM - - w(0,0) C=0..7, K = 0..7 | w(0,0) C=0..7, K = 32..39 - w(0,0) C=8..15,K=0..7 | w(0,0) C=8..15,K = 32..39 - w(0,0) C=0..7, K=8..15 | w(0,0) C=0..7, K = 40..47 - w(0,0)
C=8..15,K=8..15 | w(0,0) C=8..15,K= 40..47 - - */ - uint wt_slm_rd_offset = threadid_mod_2*4; - - int kd; - - __attribute__((opencl_unroll_hint(1))) - for(kd = 0; kd < ( _ID / PACK ) ; kd++) - { - { - /* Load Activation from global to SLM */ - - int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset; - - __global uint *activation_tile = (__global uint*)&inputs[ in_addr ]; - - __local uint *act_slm_ptr = (__local uint *) &act_slm [ act_slm_write ]; - - /* The odd thread in fused pair will start from next 4x8 block */ - - activation_tile += threadid_mod_2*4*8; - act_slm_ptr += threadid_mod_2*4*8; - - int4 act_col_0 = as_int4( intel_sub_group_block_read4(activation_tile) );//col 0 - int4 act_col_1 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );//col 2 - int4 act_col_2 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );//col 4 - int4 act_col_3 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );//col 6 - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) ); - - if ( threadid_mod_2 == 0 ) - { - int4 act_col_4 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) ); - } - - if ( thread_id >=14) - { - activation_tile = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8; - act_slm_ptr = act_slm_ptr + (TILE_W + 2) * BATCH_PACK *8; - - int4 act_col_9 = as_int4( intel_sub_group_block_read4(activation_tile) ); - int4 act_col_10 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); - int4 act_col_11 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); - int4 act_col_12 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_9 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_10 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) ); - - if ( threadid_mod_2 == 0 ) - { - int4 act_col_13 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) ); - } - } - - /* load weights from global to weight_slm */ - - int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset; - - __global uint *weight_tile = (__global uint*)&weights [ weight_addr ]; - __local uint *wt_slm_ptr = (__local uint *)&weight_slm [ slm_write_weight ]; - - __global uint *weight_tile_2 = weight_tile; - __local uint *wt_slm_ptr_2 = wt_slm_ptr; - - int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); // Pixel1 K=0..7 C=0..15 - int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=0..7 C=16..31 - int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // Pixel2 K=0..7 C=0..15 - int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=0..7 C=16..31 - - // Goto next output channel - weight_tile += weight_size_CRS*8; - - int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); // Pixel1 K=8..15 C=0..15 - int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=8..15 C=16..31 - int4 w6 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // 
Pixel2 K=8..15 C=0..15 - int4 w7 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=8..15 C=16..31 - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr, as_uint4 ( w0 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ), as_uint4 ( w2 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ), as_uint4 ( w3 ) ); - - wt_slm_ptr += 16*8; - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w5 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ) , as_uint4 ( w6 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ) , as_uint4 ( w7 ) ); - - if( threadid_mod_8 < 2 ) - { - // Goto next pixel - weight_tile_2 += 16*8; - wt_slm_ptr_2 += 2*64*8; - - int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) ); // Pixel1 K=0..7 C=0..15 - int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) ); // Pixel1 K=0..7 C=16..31 - - // Goto next output channel - weight_tile_2 += weight_size_CRS*8; - - int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) ); // Pixel1 K=8..15 C=0..15 - int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) ); // Pixel1 K=8..15 C=16..31 - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2, as_uint4 ( w0 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w1 ) ); - - wt_slm_ptr_2 += 16*8; - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2 , as_uint4 ( w4 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w5 ) ); - } - } - - // Synchronize SLM writes across workgroup - barrier(CLK_LOCAL_MEM_FENCE); - - if ( lid_z <= 6 ) - { - uint wt_slm_rd = wt_slm_rd_offset; - - __local uint *slm_ptr0 = (__local uint *) &act_slm[ act_slm_read ]; - __local uint *slm_ptr1 = (__local uint *) &weight_slm[ wt_slm_rd ]; - - /* balancing load of weights, activations */ - int8 weights_reg[3]; //24 registers - int4 act_reg[18]; //72 registers - uint slm_read_pixel_offset = 64*8; - - /********************************************************************************************************** - First phase - multiply first row of weights and 1st row of activations - ***********************************************************************************************************/ - - /* Load weights from SLM into registers - row0, output channels 0..7 */ - - { - __local uint *slm_ptrw0 = slm_ptr1; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - /* load 1Hx9Wx4N inputs, Activation row0 */ - - __attribute__((opencl_unroll_hint(9))) - for (int ic = 0; ic < 9; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = ic * BATCH_PACK * 8 ; - - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/ - - /* Output channels 0-7 */ - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], 
weights_reg[0] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row0, output channels 8..15 */ - - { - __local uint *slm_ptrw0 = slm_ptr1 + 2*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row0, output channels 
16..23 */ - { - __local uint *slm_ptrw0 = slm_ptr1 + 4*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] ); - - /* load 1Hx9Wx4N inputs, Activation row1 */ - - uint slm_row_offset_2 = 1*(TILE_W + 2)*BATCH_PACK*8; - - __attribute__((opencl_unroll_hint(9))) - for (int ic = 0; ic < 9; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = slm_row_offset_2 + ic * BATCH_PACK * 8 ; - - act_reg [ ic + 9 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row0, output channels 24..31 */ - { - __local uint *slm_ptrw0 = slm_ptr1 + 6*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] ); 
- out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] ); - - /********************************************************************************************************** - Second phase - multiply second row of weights and second row of activations - ***********************************************************************************************************/ - - /* Load weights from SLM into registers - row1, output channels 0..7 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[9], weights_reg[0] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[10], weights_reg[0] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[11], weights_reg[0] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[12], weights_reg[0] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[13], weights_reg[0] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[14], weights_reg[0] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[15], weights_reg[0] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[10], weights_reg[1] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[11], weights_reg[1] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[12], weights_reg[1] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[13], weights_reg[1] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[14], weights_reg[1] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[15], weights_reg[1] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[16], weights_reg[1] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[11], weights_reg[2] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[12], weights_reg[2] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[13], weights_reg[2] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[14], weights_reg[2] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[15], weights_reg[2] ); - out_07[ 5 ] = 
_MMAD_4x8 ( out_07[ 5 ], act_reg[16], weights_reg[2] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[17], weights_reg[2] ); - - /* Load weights from SLM into registers - row1, output channels 8..15 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 2*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[9], weights_reg[0] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[10], weights_reg[0] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[11], weights_reg[0] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[12], weights_reg[0] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[13], weights_reg[0] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[14], weights_reg[0] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[15], weights_reg[0] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[10], weights_reg[1] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[11], weights_reg[1] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[12], weights_reg[1] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[13], weights_reg[1] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[14], weights_reg[1] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[15], weights_reg[1] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[16], weights_reg[1] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[11], weights_reg[2] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[12], weights_reg[2] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[13], weights_reg[2] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[14], weights_reg[2] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[15], weights_reg[2] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[16], weights_reg[2] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[17], weights_reg[2] ); - - /* Load weights from SLM into registers - row1, output channels 16..23 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 4*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[9], weights_reg[0] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[10], weights_reg[0] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[11], weights_reg[0] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[12], weights_reg[0] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[13], weights_reg[0] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[14], weights_reg[0] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[15], weights_reg[0] 
); - - /* load 1Hx9Wx4N inputs, Activation row2 */ - - uint slm_row_offset_3 = 2*(TILE_W + 2)*BATCH_PACK*8; - - __attribute__((opencl_unroll_hint(9))) - for (int ic = 0; ic < 9; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = slm_row_offset_3 + ic * BATCH_PACK * 8 ; - - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[10], weights_reg[1] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[11], weights_reg[1] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[12], weights_reg[1] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[13], weights_reg[1] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[14], weights_reg[1] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[15], weights_reg[1] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[16], weights_reg[1] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[11], weights_reg[2] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[12], weights_reg[2] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[13], weights_reg[2] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[14], weights_reg[2] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[15], weights_reg[2] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[16], weights_reg[2] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[17], weights_reg[2] ); - - /* Load weights from SLM into registers - row1, output channels 24..31 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 6*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[9], weights_reg[0] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[10], weights_reg[0] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[11], weights_reg[0] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[12], weights_reg[0] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[13], weights_reg[0] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[14], weights_reg[0] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[15], weights_reg[0] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[10], weights_reg[1] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[11], weights_reg[1] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[12], weights_reg[1] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[13], weights_reg[1] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[14], weights_reg[1] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[15], weights_reg[1] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[16], weights_reg[1] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[11], weights_reg[2] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[12], weights_reg[2] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[13], weights_reg[2] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[14], weights_reg[2] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[15], 
weights_reg[2] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[16], weights_reg[2] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[17], weights_reg[2] ); - - /********************************************************************************************************** - Third phase - multiply third row of weights and third row of activations - ***********************************************************************************************************/ - - /* Load weights from SLM into registers - row2, output channels 0..7 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row2, output channels 8..15 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 2*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 
], act_reg[3], weights_reg[0] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row2, output channels 16..23 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 4*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from 
SLM into registers - row2, output channels 24..31 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 6*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] ); - } - - // To make sure all threads in WG have finished compute before next depth tile of activation and weights are loaded into SLM - barrier(CLK_LOCAL_MEM_FENCE); - } //for kd - - /**************************************************************************************************************** - *******************************Output Write Stage**************************************************************** - ****************************************************************************************************************/ - /* - Outputs will be passed through activation function and quantized to 8 bits before writing - Output will be in same format as input [K/32][N/4][P][Q][4N][32K] */ - - /******************* Write output to SLM *************************************/ - - /* Quantize and pack 4x1 byte - from consecutive n-coordinates - Each thread produces [1P][7Q][4N][32K] - Write uint32 from each lane to SLM , the entire thread will write 32 consecutive K-coordinates - - Assume one SLM row as 32 uints ( 32 channels , four batches for each channel - 4NK ) - In SLM 7x7x4x32 present first then the next 32 channels - */ - - if( lid_z <= 6 ) - { - /* feature maps are an array of slicePacks, each H,W position within the slice pack contains 32 8bit feature maps(channels) of 4 different batches */ - uint
row_size_bytes = (_OW + OWPAD) * PACK * BATCH_PACK; - - /* slice_pack is a pack of 32 feature map tiles that are [OH][OW][4][32] that are stored within the full [K/32][N/4][OH][OW][4][32] output */ - uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD); - - /* Each output_depth WG writes 64 output channels */ - - uint output_depth_index = output_depth*2 + threadid_mod_2; - uint batch_index = batch; - - /* Each WG produces entire 7x7 output, hence no group_y, group_z tiling */ - - uint output_offset_x = groupy_tile * OUT_X_PITCH; - uint output_offset_y = groupz_tile * OUT_Y_PITCH; - uint slice_pack_addr_bytes = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + lid_z * row_size_bytes; - - __global uchar* output_write_ptr = (__global uchar *) &outputs [ slice_pack_addr_bytes + output_offset_x + output_offset_y ]; - - const uint feature = output_depth_index * 32 + get_sub_group_local_id(); - - const float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) )); - const float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) )); - const float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) )); - - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for (int col = 0; col < OUT_BLOCK_WIDTH; col++) - { - - int4 outvec0 = out_07[col]; - int4 outvec1 = out_815[col]; - int4 outvec2 = out_1623[col]; - int4 outvec3 = out_2431[col]; - - /* Non-Linear Activation & Quantization code */ - - uchar8 out_write_N2K4[2]; - - QUANTIZATION; - - intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[0] ); - output_write_ptr += 64; - intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[1] ); - output_write_ptr += 64; - - } // out_block_width-for loop - }//lid_z loop -} //end of kernel - -#undef SCAL -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl deleted file mode 100644 index 0066ddd618b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
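The convolution_grad_weights kernels removed in this patch all finish with the same per-weight SGD step: the accumulated gradient is scaled by lr, an L2 weight-decay term (DECAY_RATE) is added, and, when MOMENTUM is defined, the previous update is blended in via MOMENTUM_FACTOR. As a reading aid only (not part of the patch), here is a minimal scalar C sketch of the MOMENTUM branch of the 1x1 variant below; all names are illustrative:

#include <stdio.h>

/* Hypothetical scalar model of the per-weight update performed by the
 * deleted convolution_grad_weights_gpu_1x1 kernel when MOMENTUM is set:
 *   update = lr * (prev * MOMENTUM_FACTOR + grad + DECAY_RATE * w)
 *   w -= update; prev = update;
 * The real kernel applies this to UNIT_TYPE buffers at weights_idx. */
static float sgd_momentum_step(float *w, float *prev, float grad,
                               float lr, float momentum, float decay)
{
    float update = lr * (*prev * momentum + grad + decay * *w);
    *w -= update;     /* descend along the smoothed, decayed gradient */
    *prev = update;   /* remember the update for the next iteration */
    return update;
}

int main(void)
{
    float w = 0.5f, prev = 0.0f;
    sgd_momentum_step(&w, &prev, 0.1f /*grad*/, 0.01f /*lr*/,
                      0.9f /*momentum*/, 0.0005f /*decay*/);
    printf("w=%f prev_update=%f\n", w, prev);
    return 0;
}

Note that the reference and yxfb variants further below order the terms differently, computing lr * (grad + DECAY_RATE * w) + prev * MOMENTUM_FACTOR, so there the momentum term is not scaled by the learning rate.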
- -#include "include/include_all.cl" - -__attribute__((intel_reqd_sub_group_size(16))) -KERNEL(convolution_grad_weights_gpu_1x1)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint local_id = get_local_id(0); - const uint ifm = get_global_id(1); - const uint ofm = get_global_id(2); - - const int in_x = -PADDING_SIZE_X; - const int in_y = -PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w = 0; - -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - UNIT_TYPE result = UNIT_VAL_ZERO; - -#if BIAS_TERM - UNIT_TYPE result_bias = UNIT_VAL_ZERO; -#endif - - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for (uint i = 0; i < INPUT0_SIZE_Y; i++) - { - const int input_offset_y = in_y + i * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - for (uint j = 0; j < (INPUT0_SIZE_X + 15)/16; j++) - { - const int input_offset_x = in_x + j * STRIDE_SIZE_X * 16 + local_id * STRIDE_SIZE_X; - const bool zero_x = input_offset_x >= INPUT1_SIZE_X || input_offset_x < 0; - const bool grad_zero = j*16 + local_id >= INPUT0_SIZE_X; -#if BIAS_TERM - UNIT_TYPE grad; - if(grad_zero) - { - grad = 0; - } - else - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH*16 + local_id*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad = input_grad[input_grad_idx]; - } -#endif - if(!zero_x && !zero_y) - { - uint input_idx = in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; -#if BIAS_TERM - result = fma(input[input_idx], grad, result); -#else - if(!grad_zero) - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH*16 + local_id*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - result = fma(input[input_idx], input_grad[input_grad_idx], result); - } -#endif - } -#if BIAS_TERM - result_bias += grad; -#endif - } - } - - grad_w += result; - -#if BIAS_TERM - grad_b += result_bias; -#endif - } - - grad_w = sub_group_reduce_add(grad_w); -#if BIAS_TERM - grad_b = sub_group_reduce_add(grad_b); -#endif - - if (local_id == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_w = lr * (prev_grad_w[weights_idx] * MOMENTUM_FACTOR + grad_w + DECAY_RATE * filter[weights_idx]); - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; -#else - filter[weights_idx] -= lr * grad_w + DECAY_RATE * lr * filter[weights_idx]; -#endif - -#if BIAS_TERM - if(ifm == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif - } -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl deleted file mode 100644 index df6a4595708..00000000000 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - -KERNEL(convolution_grad_weights_gpu_3x3)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint ofm = get_global_id(0); - const uint ifm = get_global_id(1); - - if (ofm >= INPUT0_FEATURE_NUM || ifm >= INPUT1_FEATURE_NUM) - return; - - const int in_x = -PADDING_SIZE_X; - const int in_y = -PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w[9] = {}; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for (uint i = 0; i < INPUT0_SIZE_Y; i++) - { - - for (uint j = 0; j < INPUT0_SIZE_X; j+=2) - { - float2 grad; - if (j + 1 >= INPUT0_SIZE_X) - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad.s0 = input_grad[input_grad_idx]; - grad.s1 = 0; - } - else - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad = vload2(0, &input_grad[input_grad_idx]); - } - for (uint y = 0; y < 3; y++) - { - const int input_offset_y = in_y + y + i; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - const int input_offset_x = in_x + j; - const bool zero_x = input_offset_x < 0 || input_offset_x + 3 >= INPUT1_SIZE_X; - uint input_idx = in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; - union v4 { - float s[4]; - float4 v; - }; - union v4 inp; - if (zero_y) - continue; - if (zero_x) - { - for (uint k = 0; k < 4; k++) - { - if (input_offset_x + k >= INPUT1_SIZE_X || input_offset_x + k < 0) - inp.s[k] = 0; - else - inp.s[k] = input[input_idx + k]; - } - } - else - { - inp.v = vload4(0, &input[input_idx]); - } - for (uint x = 0; x < 3; x++) - { - grad_w[y * 3 + x] = mad(inp.s[x] * lr, grad.s0, grad_w[y * 3 + x]); - grad_w[y * 3 + x] = mad(inp.s[x + 1] * lr, grad.s1, grad_w[y * 3 + x]); - } - } -#if BIAS_TERM - grad_b += grad.s0; - grad_b += grad.s1; -#endif - } - } - } - - union { - float s[8]; - float8 v; - } uweights_0_7; - float uweights8; - -#if MOMENTUM - float dwa[9]; - uweights_0_7.v = vload8(0, &prev_grad_w[weights_idx]); - dwa[0 * 3 + 0] = uweights_0_7.v.s0; - dwa[0 * 3 + 1] = 
uweights_0_7.v.s1; - dwa[0 * 3 + 2] = uweights_0_7.v.s2; - dwa[1 * 3 + 0] = uweights_0_7.v.s3; - dwa[1 * 3 + 1] = uweights_0_7.v.s4; - dwa[1 * 3 + 2] = uweights_0_7.v.s5; - dwa[2 * 3 + 0] = uweights_0_7.v.s6; - dwa[2 * 3 + 1] = uweights_0_7.v.s7; - dwa[2 * 3 + 2] = prev_grad_w[weights_idx + 8]; -#endif - - uweights_0_7.v = vload8(0, &filter[weights_idx]); - uweights8 = filter[weights_idx + 8]; - -#if MOMENTUM - float8 newDelta_0_7 = (float8)( - grad_w[0 * 3 + 0] + (MOMENTUM_FACTOR * dwa[0 * 3 + 0]), - grad_w[0 * 3 + 1] + (MOMENTUM_FACTOR * dwa[0 * 3 + 1]), - grad_w[0 * 3 + 2] + (MOMENTUM_FACTOR * dwa[0 * 3 + 2]), - grad_w[1 * 3 + 0] + (MOMENTUM_FACTOR * dwa[1 * 3 + 0]), - grad_w[1 * 3 + 1] + (MOMENTUM_FACTOR * dwa[1 * 3 + 1]), - grad_w[1 * 3 + 2] + (MOMENTUM_FACTOR * dwa[1 * 3 + 2]), - grad_w[2 * 3 + 0] + (MOMENTUM_FACTOR * dwa[2 * 3 + 0]), - grad_w[2 * 3 + 1] + (MOMENTUM_FACTOR * dwa[2 * 3 + 1])); - float newDelta8 = grad_w[2 * 3 + 2] + (MOMENTUM_FACTOR * dwa[2 * 3 + 2]); -#else - float8 newDelta_0_7 = (float8)( - grad_w[0 * 3 + 0], - grad_w[0 * 3 + 1], - grad_w[0 * 3 + 2], - grad_w[1 * 3 + 0], - grad_w[1 * 3 + 1], - grad_w[1 * 3 + 2], - grad_w[2 * 3 + 0], - grad_w[2 * 3 + 1]); - float newDelta8 = grad_w[2 * 3 + 2]; -#endif - uweights8 -= newDelta8; - uweights_0_7.v -= newDelta_0_7; - - vstore8(uweights_0_7.v, 0, &filter[weights_idx]); - filter[weights_idx + 8] = uweights8; -#if MOMENTUM - vstore8(newDelta_0_7, 0, &prev_grad_w[weights_idx]); - prev_grad_w[weights_idx + 8] = newDelta8; -#endif - -#if BIAS_TERM - if(ifm == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl deleted file mode 100644 index e5d9fde8403..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
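Each grad-weights kernel accumulates, for a given filter tap, the product of the forward input and the incoming gradient over every batch and output position, with taps that fall into the zero padding contributing nothing. A naive scalar C reference of that accumulation follows, simplified to a single input and output channel with illustrative names; the real kernels also loop over ifm/ofm, and the 3x3 and 7x7 variants fold lr into the accumulation itself:

#include <stdio.h>

/* Hypothetical scalar reference for the weight-gradient accumulation in
 * the deleted kernels, for one filter tap (ky, kx):
 *   dW[ky][kx] = sum over b, i, j of
 *     in[b][i*stride + ky - pad][j*stride + kx - pad] * out_grad[b][i][j]
 * where out-of-range input positions contribute zero. */
static float weight_grad_tap(const float *in, const float *out_grad,
                             int B, int IH, int IW, int OH, int OW,
                             int stride, int pad, int ky, int kx)
{
    float g = 0.0f;
    for (int b = 0; b < B; b++)
        for (int i = 0; i < OH; i++)
            for (int j = 0; j < OW; j++) {
                int iy = i * stride + ky - pad;  /* input row hit by this tap */
                int ix = j * stride + kx - pad;  /* input col hit by this tap */
                if (iy < 0 || iy >= IH || ix < 0 || ix >= IW)
                    continue;                    /* tap lands in zero padding */
                g += in[(b * IH + iy) * IW + ix]
                   * out_grad[(b * OH + i) * OW + j];
            }
    return g;
}

int main(void)
{
    /* 2x2 output of a stride-1, pad-0, 2x2 filter over one 3x3 input */
    float in[1 * 3 * 3]   = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    float dout[1 * 2 * 2] = { 1, 0, 0, 1 };
    printf("dW[0][0] = %f\n",
           weight_grad_tap(in, dout, 1, 3, 3, 2, 2, 1, 0, 0, 0));
    return 0;
}

The SIMD16 variants (1x1 and yxfb) spread one of these loops across sub-group lanes and combine the partial sums with sub_group_reduce_add before applying the update.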
- -#include "include/include_all.cl" - -KERNEL(convolution_grad_weights_gpu_7x7)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint x_filter = get_global_id(0); - const uint ofm = get_global_id(1); - const uint ifm = get_global_id(2); - - if (x_filter >= 7 || ofm >= INPUT0_FEATURE_NUM || ifm >= INPUT1_FEATURE_NUM) - return; - - const int in_x = -PADDING_SIZE_X; - const int in_y = -PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w[7] = { 0, 0, 0, 0, 0, 0, 0 }; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = UNIT_VAL_ZERO; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for(int i = 0; i < INPUT0_SIZE_Y; i++) - { - for(int j = 0; j < INPUT0_SIZE_X; j++) - { - float grad; - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad = input_grad[input_grad_idx]; - for(uint y_filter = 0; y_filter < 7; y_filter++) - { - const int input_offset_y = in_y + y_filter + i * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - const int input_offset_x = in_x + x_filter + j * STRIDE_SIZE_X; - const bool zero_x = input_offset_x < 0 || input_offset_x >= INPUT1_SIZE_X; - uint input_idx = in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; - if(!zero_x && !zero_y) - { - const float delta_f = input[input_idx] * lr * grad; - grad_w[y_filter] += delta_f; - } - } -#if BIAS_TERM - grad_b += grad; -#endif - } - } - } - for(uint y_filter = 0; y_filter < 7; y_filter++) - { - uint address = weights_idx + 48 - (7 * (6 - y_filter) + (6 - x_filter)); -#if MOMENTUM - float dw = prev_grad_w[address]; - const float delta_f_m = MOMENTUM_FACTOR * dw; - grad_w[y_filter] += delta_f_m; - prev_grad_w[address] = grad_w[y_filter]; -#endif - filter[address] -= grad_w[y_filter]; - } -#if BIAS_TERM - if(ifm == 0 && x_filter == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl deleted file mode 100644 index 98bbc29c44d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" -KERNEL(convolution_grad_weights_gpu_ref)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global float* filter, -#if BIAS_TERM - __global float* bias, -#endif -#if MOMENTUM - __global float* prev_grad_w, -#if BIAS_TERM - __global float* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint ofm_ifm = get_global_id(0); - const uint id_x = (uint)get_global_id(1); - const uint id_y = (uint)get_global_id(2); - const uint ifm = ofm_ifm % INPUT1_FEATURE_NUM; - const uint ofm = ofm_ifm / INPUT1_FEATURE_NUM; - - const int in_x = id_x - PADDING_SIZE_X; - const int in_y = id_y - PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w = 0; - -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH + id_y * FILTER_Y_PITCH + id_x * FILTER_X_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - ACCUMULATOR_TYPE result = ACCUMULATOR_TYPE_ZERO; - -#if BIAS_TERM - ACCUMULATOR_TYPE result_bias = ACCUMULATOR_TYPE_ZERO; -#endif - - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for (uint i = 0; i < INPUT0_SIZE_Y; i++) - { - for (uint j = 0; j < INPUT0_SIZE_X; j++) - { - const int input_offset_y = in_y + i * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - const int input_offset_x = in_x + j * STRIDE_SIZE_X; - const bool zero_x = input_offset_x >= INPUT1_SIZE_X || input_offset_x < 0; -#if BIAS_TERM - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - ACCUMULATOR_TYPE grad = TO_ACCUMULATOR_TYPE(input_grad[input_grad_idx]); -#endif - if(!zero_x && !zero_y) - { - uint input_idx = INPUT1_OFFSET + in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; -#if BIAS_TERM - result = fma(TO_ACCUMULATOR_TYPE(input[input_idx]), grad, result); -#else - uint input_grad_idx = INPUT0_OFFSET + grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - result = fma(TO_ACCUMULATOR_TYPE(input[input_idx]), TO_ACCUMULATOR_TYPE(input_grad[input_grad_idx]), result); -#endif - } -#if BIAS_TERM - result_bias += grad; -#endif - } - } - - grad_w += result; - -#if BIAS_TERM - grad_b += result_bias; -#endif - } - -#if OUTPUT_GRAD_W - output[weights_idx] = grad_w; -#else - #if MOMENTUM - float update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; - #else - filter[weights_idx] -= lr * grad_w + DECAY_RATE * lr * filter[weights_idx]; - #endif - -#if BIAS_TERM - if(ifm == 0 && id_x == 0 && id_y == 0) - { -#if MOMENTUM - float update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= 
update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif - -#endif - -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl deleted file mode 100644 index fba71dbdd69..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - -__attribute__((intel_reqd_sub_group_size(16))) -KERNEL(convolution_grad_weights_gpu_ref)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint local_id = get_local_id(0); - const uint ofm_ifm = get_global_id(1); - const uint id_x_y = get_global_id(2); - - const uint id_x = id_x_y % FILTER_SIZE_X; - const uint id_y = id_x_y / FILTER_SIZE_X; - const uint ifm = ofm_ifm % INPUT1_FEATURE_NUM; - const uint ofm = ofm_ifm / INPUT1_FEATURE_NUM; - - const int in_x = id_x - PADDING_SIZE_X; - const int in_y = id_y - PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w = 0; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH + id_y * FILTER_Y_PITCH + id_x * FILTER_X_PITCH; - - for(int y = 0; y < INPUT0_SIZE_Y; y++) - { - const int input_offset_y = in_y + y * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - const int input_offset_x = in_x + x * STRIDE_SIZE_X; - const bool zero_x = input_offset_x >= INPUT1_SIZE_X || input_offset_x < 0; - for (uint b = 0; b < INPUT0_BATCH_NUM / 16; b++) - { -#if BIAS_TERM - uint input_grad_idx = grad_split_offset + b*16*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + x*INPUT0_X_PITCH + y*INPUT0_Y_PITCH; - UNIT_TYPE grad = as_float(intel_sub_group_block_read((const __global uint*)(input_grad + input_grad_idx))); - grad_b += grad; -#endif - if(!zero_x && !zero_y) - { - uint input_idx = in_split_offset + b*16*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; -#if BIAS_TERM - grad_w = fma(as_float(intel_sub_group_block_read((const __global uint*)(input + input_idx))), grad, grad_w); -#else - uint input_grad_idx = grad_split_offset + b*16*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + x*INPUT0_X_PITCH + y*INPUT0_Y_PITCH; - grad_w = 
fma(as_float(intel_sub_group_block_read((const __global uint*)(input + input_idx))), as_float(intel_sub_group_block_read((const __global uint*)(input_grad + input_grad_idx))), grad_w); -#endif - } - } - } - } - - grad_w = sub_group_reduce_add(grad_w); -#if BIAS_TERM - grad_b = sub_group_reduce_add(grad_b); -#endif - - if (local_id == 0) - { -#if OUTPUT_GRAD_W - output[weights_idx] = grad_w; -#else - #if MOMENTUM - UNIT_TYPE update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; - #else - filter[weights_idx] -= lr * (grad_w + DECAY_RATE * filter[weights_idx]); - #endif - -#if BIAS_TERM - if(ifm == 0 && id_x == 0 && id_y == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif -#endif - } -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl index c36cb5a07ce..8691930df78 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl @@ -89,15 +89,6 @@ KERNEL(deconvolution_gpu_bfyx_opt)( uint fixed_input_offset_y = (uint)input_offset_y / STRIDE_SIZE_Y; uint input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + (uint)fixed_input_offset_y*INPUT0_Y_PITCH; -#if GRADIENT - uint filter_idx = filter_offset + of*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; - for (uint h = 0; h < FILTER_OFM_NUM; h++) - { - acc += TO_ACCUMULATOR_TYPE(input[input_idx]) * TO_ACCUMULATOR_TYPE(filter[filter_idx]); - filter_idx += FILTER_OFM_PITCH; - input_idx += INPUT0_FEATURE_PITCH; - } -#else uint filter_idx = filter_offset + of*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_IFM_NUM; h++) { @@ -105,7 +96,6 @@ KERNEL(deconvolution_gpu_bfyx_opt)( filter_idx += FILTER_IFM_PITCH; input_idx += INPUT0_FEATURE_PITCH; } -#endif } } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl index f7fd8ee408a..d87bbafd948 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl @@ -105,24 +105,6 @@ KERNEL(deconvolution_gpu_yxfb_ref)( input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + (uint)fixed_input_offset_y*INPUT0_Y_PITCH + (uint)fixed_input_offset_z*INPUT0_Z_PITCH; #endif -#if GRADIENT - uint filter_idx = filter_offset + of*FILTER_IFM_PITCH + (FILTER_SIZE_Z - k - 1)*FILTER_Z_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; - for (uint h = 0; h < FILTER_OFM_NUM; h++) { -#if !INPUT0_SIMPLE -# if INPUT0_DIMS <= 4 - input_idx = INPUT0_GET_INDEX(batch_offset, h + g*FILTER_IFM_NUM, fixed_input_offset_y, fixed_input_offset_x); -# elif INPUT0_DIMS == 5 - input_idx = INPUT0_GET_INDEX(batch_offset, h + g*FILTER_IFM_NUM, 
fixed_input_offset_z, fixed_input_offset_y, fixed_input_offset_x); -# endif -#endif - - acc += TO_ACCUMULATOR_TYPE(input[input_idx]) * TO_ACCUMULATOR_TYPE(filter[filter_idx]); - filter_idx += FILTER_OFM_PITCH; -#if INPUT0_SIMPLE - input_idx += INPUT0_FEATURE_PITCH; -#endif - } -#else // GRADIENT uint filter_idx = filter_offset + of*FILTER_OFM_PITCH + (FILTER_SIZE_Z - k - 1)*FILTER_Z_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_IFM_NUM; h++) { #if !INPUT0_SIMPLE @@ -139,7 +121,6 @@ KERNEL(deconvolution_gpu_yxfb_ref)( input_idx += INPUT0_FEATURE_PITCH; #endif } -#endif // GRADIENT } } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl deleted file mode 100644 index 00a06c524a8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "include/include_all.cl" - -#define PACK 4 - -#define SGR_MAX_SIZE (get_max_sub_group_size()) -#define SGR_LOCAL_ID (get_sub_group_local_id()) - -#define GET_INDEX(_x) \ - ( ((_x / SGR_MAX_SIZE) * SGR_MAX_SIZE /* Normed to max_subgroup_size */) \ - * (4 * sizeof(int) /* 4xINT32 per sub_group reading */) \ - ) - -inline int16 FUNC(get_int16)(const __global UNIT_TYPE* src, uint idx) -{ - int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(src + idx))); - int16 to_return; - for(uint i = 0; i < 4; i++) - { - for(uint j = 0; j < 4; j++) - { - to_return[i * 4 + j] = as_char4(int_data[i])[j]; - } - } - return to_return; -} -#define GET_INPUT(A, B) FUNC_CALL(get_int16)(A, GET_INDEX(x)) - - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(eltwise_b_fs_yx_fsv4)( - INPUTS_DECLS - __global UNIT_TYPE* output -#if CALIBRATION_TERM - , const __global float* calibrations -#endif - ) -{ - // This kernel works with linearized data w/o strides and padding - // so only one dimension 'X' is required - const uint x = get_global_id(0); - const uint idx = GET_INDEX(x); - - int16 res; - - DO_ELTWISE; - - for(uint i = 0; i < 4; i++) - { - const uint out_idx = idx + (sizeof(int) * (SGR_LOCAL_ID + (i * SGR_MAX_SIZE))); - char4 char_res; - - for(uint j = 0; j < 4; j++) - { - int res_tmp = res[i * 4 + j]; - #if QUANTIZATION_TERM - #if CALIBRATION_TERM - // Batch: - const uint b = out_idx / OUTPUT_BATCH_PITCH; - // Feature: - // Because of specific data layout Feature must be normed to PACK size - uint d3 = ((out_idx - b * OUTPUT_BATCH_PITCH) / (OUTPUT_FEATURE_PITCH * PACK)) * PACK; - res_tmp = (int)round(((float)res_tmp) * calibrations[d3+j]); - #else // CALIBRATION_TERM - res_tmp = (int)round(((float)res_tmp) * O_QF); - #endif // CALIBRATION_TERM - #endif // QUANTIZATION_TERM - - #if QUANTIZATION_TERM - #ifdef ELTW_UNSIGNED - char_res[j] = 
ACTIVATION(convert_uchar_sat(res_tmp), ACTIVATION_PARAMS); - #else - char_res[j] = ACTIVATION(convert_char_sat(res_tmp), ACTIVATION_PARAMS); - #endif - #else - char_res[j] = ACTIVATION(convert_char(res_tmp), ACTIVATION_PARAMS); - #endif - } - // put 4 chars into output - // char_result[i] = as_int(char_res); - *((__global int*)(output + out_idx)) = as_int(char_res); - } -} - -#undef PACK -#undef SGR_MAX_SIZE -#undef SGR_LOCAL_ID -#undef GET_INDEX -#undef GET_INPUT diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index f93b1ab5aa7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,83 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "include/include_all.cl" - -#ifdef INPUT_STRIDED -#define GET_INDEX(src) \ - GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2 * CAT(src, _STRIDE_Y), d1 * CAT(src, _STRIDE_X)) -#else -#define GET_INDEX(src) \ - GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2, d1) -#endif - -int16 FUNC(get_int16)(const __global UNIT_TYPE* src, uint idx) -{ - int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(src + idx))); - int16 to_return; - for(uint b = 0; b < 4; b++) - { - for(uint f = 0; f < 4; f++) - { - to_return[b * 4 + f] = as_char4(int_data[b])[f]; - } - } - return to_return; -} -#define GET_INPUT(A, B) FUNC_CALL(get_int16)(A, GET_INDEX(B)) - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(eltwise_fs_bs_yx_bsv4_fsv32)( - INPUTS_DECLS - __global UNIT_TYPE* output -#if CALIBRATION_TERM - , const __global float* calibrations -#endif - ) -{ - const uint of_32_aligned = ((OUTPUT_FEATURE_NUM + 31) / 32) * 32; - const uint d1 = get_global_id(0); // X - const uint d2 = get_global_id(1); // Y - const uint d3 = ((uint)get_global_id(2) * 4) % of_32_aligned; // Feature - const uint d4 = 4 * (((uint)get_global_id(2) * 4) / of_32_aligned); // Batch - - int16 res; - - DO_ELTWISE; - - int4 char_result; - for(uint b = 0; b < 4; b++) - { - char4 char_res; - for(uint f = 0; f < 4; f++) - { - int res_tmp = res[b * 4 + f]; - #if CALIBRATION_TERM - res_tmp = (int)round(((float)res_tmp) * calibrations[d3+f]); - #else // CALIBRATION_TERM - res_tmp = (int)round(((float)res_tmp) * O_QF); - #endif // CALIBRATION_TERM - char_res[f] = ACTIVATION(convert_char_sat(res_tmp), ACTIVATION_PARAMS); - } - // pack 4 chars into int - char_result[b] = as_int(char_res); - } - - uint output_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, d4, d3, d2, d1); - intel_sub_group_block_write4((__global uint*)(output + output_offset), as_uint4(char_result)); -} - -#undef GET_INDEX diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl deleted file mode 100644 index f1a5a4e5f9f..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - -KERNEL(embed_ref)(const __global UNIT_TYPE* input0, - __global UNIT_TYPE* output, - const __global UNIT_TYPE* weights -#if BIAS_TERM - ,const __global UNIT_TYPE* biases -#endif -) -{ - const uint x = (uint)get_global_id(0); - const uint y = (uint)get_global_id(1); - const uint b = (uint)get_global_id(2); - - uint output_idx = (b*INPUT0_ELEMENTS_COUNT*NUM_OUTPUT_SIZE)+(uint)(x*NUM_OUTPUT_SIZE+y); - output[output_idx] = weights[(uint)(input0[(b*INPUT0_ELEMENTS_COUNT)+x]*NUM_OUTPUT_SIZE+y)]; -#if BIAS_TERM - output[output_idx] += biases[y]; -#endif -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl deleted file mode 100644 index 76169fe87eb..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(fully_connected_kernel_mmad_batched)( - const __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - const __global FILTER_TYPE* weights -#if BIAS_TERM - , const __global BIAS_TYPE* biases -#endif -#if QUANTIZATION_TERM - ,const __global float* quantizations -#endif -#if CALIBRATION_TERM - ,const __global float* calibrations -#endif - ) -{ - const uint sg_channel = get_sub_group_local_id(); - - const uint batch_id = (uint)get_group_id(0) * 8; - const uint b_block = batch_id / 4; - const uint f = (uint)get_global_id(1) % FILTER_OFM_ALIGNED; - - uint in_addr = IN_OFFSET + b_block * IN_B_BLOCK_PITCH; - - const uint filter_offset = ((uint)get_group_id(1) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint filter_idx = filter_offset; - - int8 tileA; - int8 tileB; - int8 tileC = 0; - - for(uint z = 0; z < FILTER_IFM_MMAD_NUM; z++ ) - { - for (uint k = 0; k < FILTER_SIZE_X * FILTER_SIZE_Y; ++k) - { - // load A tile ( input ) - // load 8 batches 4 channels per WI, so we'll have 8x32 block - - tileA.lo = as_int4(intel_sub_group_block_read4((const __global uint*)(input + in_addr))); - tileA.hi = as_int4(intel_sub_group_block_read4((const __global uint*)(input + in_addr + IN_B_BLOCK_PITCH))); - - // load B tile ( weights ) - tileB = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - // compute C tile ( output ) - tileC = MMAD_8x8(tileA, tileB, tileC); // here we output 8 batches per workitem, and each workitem gets different output feature - - in_addr += 32 * 4; // 4 batches * 4 features per channel * 8 SIMD channels - filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel - } - in_addr += IN_F_BLOCK_PITCH; - in_addr -= (FILTER_SIZE_X * FILTER_SIZE_Y * 32 * 4); - } - -#if BIAS_TERM -#if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, batch_id, f, y, x); -#elif BIAS_PER_OFM - const uint bias_index = f; -#endif - for(uint i = 0; i < 8; i++) - { -#if CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]); -#else // CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * O_QF); -#endif // CALIBRATION_TERM - } -#endif // BIAS_TERM - - // save to output - if(f < FILTER_OFM_NUM) - { - for(uint i = 0; i < 8; i++) - { - const uint curr_b = batch_id + i; -#if defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32 - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, curr_b, f, 0, 0); -#else - const uint dst_index = GET_DATA_INDEX(OUTPUT, curr_b, f, 0, 0); -#endif - output[dst_index] = ACTIVATION(convert_char(tileC[i]), ACTIVATION_PARAMS); - } - } -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl deleted file mode 100644 index eb1d803cbef..00000000000 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/include_all.cl" - -KERNEL(fully_connected_grad_input_gpu_ref)( - const __global INPUT0_TYPE* input_grad, - __global OUTPUT_TYPE* output, - const __global FILTER_TYPE* weights, - const __global INPUT1_TYPE* input - ) -{ - const uint x = get_global_id(1); - const uint y = get_global_id(2); - const uint b_f = get_global_id(0); - const uint batch_id = b_f % INPUT0_BATCH_NUM; - const uint feature_id = b_f / INPUT0_BATCH_NUM; - - if(b_f >= INPUT1_FEATURE_NUM * INPUT0_BATCH_NUM) - return; - - ACCUMULATOR_TYPE result = 0; - - for (uint ofm = 0; ofm < FILTER_OFM_NUM; ++ofm) - { - const uint input_grad_idx = GET_DATA_INDEX(INPUT0, batch_id, 0, 0, ofm); - const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, feature_id, y, x); - - result += (ACCUMULATOR_TYPE)(input_grad[input_grad_idx] * weights[filter_idx]); - } - - const uint output_idx = GET_DATA_INDEX(OUTPUT, batch_id, feature_id, y, x); - output[output_idx] = result; -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl deleted file mode 100644 index c038bdf3e10..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "include/include_all.cl" - -KERNEL(fully_connected_grad_weights_gpu_ref)( - const __global INPUT0_TYPE* input_grad, - __global OUTPUT_TYPE* output, - __global float* weights, -#if BIAS_TERM - __global float* bias, -#endif -#if MOMENTUM - __global float* prev_grad_w, -#if BIAS_TERM - __global float* prev_grad_b, -#endif -#endif - const __global INPUT1_TYPE* input, - const float lr - ) -{ - const uint ofm_ifm = get_global_id(0); - const uint id_x = (uint)get_global_id(1); - const uint id_y = (uint)get_global_id(2); - const uint ifm = ofm_ifm % FILTER_IFM_NUM; - const uint ofm = ofm_ifm / FILTER_IFM_NUM; - - ACCUMULATOR_TYPE grad_w = 0; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, id_y, id_x); - for (uint b = 0; b < INPUT0_BATCH_NUM; b++) - { - const uint input_grad_idx = GET_DATA_INDEX(INPUT0, b, 0, 0, ofm); - const uint input_idx = GET_DATA_INDEX(INPUT1, b, ifm, id_y, id_x); - ACCUMULATOR_TYPE grad = TO_ACCUMULATOR_TYPE(input_grad[input_grad_idx]); - grad_w += TO_ACCUMULATOR_TYPE(input[input_idx] * grad); -#if BIAS_TERM - grad_b += TO_ACCUMULATOR_TYPE(grad); -#endif - } - -#if MOMENTUM - float update_gradient_w = lr * (grad_w + DECAY_RATE * weights[filter_idx]) + prev_grad_w[filter_idx] * MOMENTUM_FACTOR; - weights[filter_idx] -= update_gradient_w; - prev_grad_w[filter_idx] = update_gradient_w; -#else - weights[filter_idx] -= lr * grad_w + DECAY_RATE * lr * weights[filter_idx]; -#endif - -#if BIAS_TERM - if(ifm == 0 && id_x == 0 && id_y == 0) - { -#if MOMENTUM - float update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif - - -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl deleted file mode 100644 index e27ff51a405..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/include_all.cl" - -#define LOCAL_SIZE INPUT0_BATCH_NUM - -__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) -KERNEL(convolution)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif - uint split_idx, - __global INPUT0_TYPE* scale_in -#if SCALE_BIAS_TERM - , __global INPUT0_TYPE* scale_bias -#endif -#if FUSED_TRAINING - , __global INPUT0_TYPE* inv_var, - __global INPUT0_TYPE* conv_output, - __global INPUT0_TYPE* bn_output -#endif - ) -{ - const uint f = get_global_id(1); - const uint b = get_global_id(0); - - UNIT_TYPE conv_out = UNIT_VAL_ZERO; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; - - const uint filter_offset = f*FILTER_OFM_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset; - - for (uint y = 0; y < OUTPUT_SIZE_Y; ++y) - { - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - for (uint x = 0; x < OUTPUT_SIZE_X; ++x) - { - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - for (uint k = 0; k < FILTER_IFM_NUM; ++k) - { - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j * DILATION_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - - if(!zero_y) - { - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - const int input_offset_x = input_x + i * DILATION_SIZE_X; - const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - - if(!zero_x) - { - uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*INPUT0_FEATURE_PITCH; - uint filter_idx = filter_offset + k*FILTER_IFM_PITCH + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; - conv_out += input[input_idx] * weights[filter_idx]; - } - } - } - } - } -#if BIAS_TERM - conv_out += (UNIT_TYPE)biases[f]; -#endif - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset; -#ifdef FUSED_TRAINING - conv_output[dst_index] = conv_out; -#else - output[dst_index] = conv_out; -#endif - } - } - - - // BATCH NORM PART - barrier(CLK_LOCAL_MEM_FENCE); - - __local ACCUMULATOR_TYPE sum[LOCAL_SIZE]; - - const uint local_idx = b; - - sum[local_idx] = 0; - - uint input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { -#ifdef FUSED_TRAINING - UNIT_TYPE in = conv_output[input_idx]; -#else - UNIT_TYPE in = output[input_idx]; -#endif - sum[local_idx] += in; - input_idx += OUTPUT_X_PITCH; - } - input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - UNIT_TYPE mean = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - - sum[local_idx] = 0; - - input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { -#ifdef FUSED_TRAINING - UNIT_TYPE in = conv_output[input_idx] - mean; -#else - UNIT_TYPE in = output[input_idx] - mean; -#endif - sum[local_idx] += in * in; - input_idx += OUTPUT_X_PITCH; - } - input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint 
offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float variance = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - - float inv_variance = (float)(1.0 / sqrt(variance + EPSILON)); - -#ifdef FUSED_TRAINING - if (local_idx == 0) - inv_var[f] = inv_variance; -#endif - - uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { -#ifdef FUSED_TRAINING - UNIT_TYPE out_val = inv_variance * (conv_output[out_idx] - mean); - bn_output[out_idx] = out_val; -#ifdef SCALE_BIAS_TERM - output[out_idx] = ACTIVATION(out_val * scale_in[f] + scale_bias[f], ACTIVATION_PARAMS); -#else - output[out_idx] = ACTIVATION(out_val * scale_in[f], ACTIVATION_PARAMS); -#endif -#else -#ifdef SCALE_BIAS_TERM - output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f] + scale_bias[f], ACTIVATION_PARAMS); -#else - output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f], ACTIVATION_PARAMS); -#endif -#endif - out_idx += OUTPUT_X_PITCH; - } - out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - -} - -#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl deleted file mode 100644 index e7af9776425..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl +++ /dev/null @@ -1,602 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "include/include_all.cl" -#include "include/sub_group.cl" -#include "include/fetch.cl" - -#define TILE_M 2 -#define TILE_K FILTER_SIZE_X -#define TILE_N 32 - -inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset)(uint out_offset, uint strideX, uint strideY) -{ -// bfyx - uint tmp_idx = out_offset; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - x_idx *= strideX; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - y_idx *= strideY; - tmp_idx /= OUTPUT_SIZE_Y; - uint f_idx = tmp_idx % OUTPUT_FEATURE_NUM; - tmp_idx /= OUTPUT_FEATURE_NUM; - uint b_idx = tmp_idx % OUTPUT_BATCH_NUM; - - return GET_DATA_INDEX(INPUT1, b_idx, f_idx, y_idx, x_idx); -} - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(fused_conv_eltwise_gemm_fp32)( - const __global float *src0, - __global float *dst, - const __global float *src1, -#if BIAS_TERM - const __global float *bias, -#endif - uint split_idx, - const __global float* src3) -{ -#include "include/vec_typedefs.cl" - - const unsigned group_x = get_group_id(0); - const unsigned group_y = get_group_id(1); - const unsigned global_x = get_global_id(0); - const unsigned global_y = get_global_id(1); - const unsigned global_z = get_global_id(2); - - unsigned interleaved_y; - unsigned kernel_y; - unsigned kernel_idx; - - // Result ctile (*dst) is M rows x N columns - // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. - float8 blockC00 = 0.f; - float8 blockC10 = 0.f; - float8 blockC20 = 0.f; - float8 blockC30 = 0.f; - float8 blockC01 = 0.f; - float8 blockC11 = 0.f; - float8 blockC21 = 0.f; - float8 blockC31 = 0.f; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM; - // Src0 (patch input) is directly used as atile. - // Each work item points to the start of a different patch. - // atile is M rows x K columns. - const uint src0_read_offset0_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset - + INPUT0_BATCH_PITCH * global_z // batch offset - + ( ( ( global_y * TILE_M + 0 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset - + ( ( ( global_y * TILE_M + 0 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset - const uint src0_read_offset1_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset - + INPUT0_BATCH_PITCH * global_z // batch offset - + ( ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset - + ( ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset - - // Src1 (filter) is directly used as btile. - // It starts at the top of src1 and walks down. - // btile is K rows x N columns. - uint src0_read_offset0 = src0_read_offset0_const; - uint src0_read_offset1 = src0_read_offset1_const; - uint src1_read_offset = ( global_x * TILE_N * 2); - -#define DOT_PRODUCT_8( _result, _rowA, colB ) \ - { \ - _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \ - _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \ - _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \ - _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \ - _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \ - _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \ - _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \ - _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \ - } - - // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. 
- // Inner loop loads and FMADs one row (FILTER_SIZE_X) of each input patch - // and FILTER_SIZE_X/2 rows of interleaved filter. - unsigned patch_depth = 0; - do - { - unsigned patch_row = 0; - do - { - // Load atile and btile. - // Kernel data is partially interleaved. Every 2 rows are interleaved at float8 granularity. - // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved. The non - // interleaved row is padded with zero to ensure same size as interleaved rows. This - // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the - // kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3. - // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) .. - // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ... - // (0, 2) (8, 2) (16, 2) (24, 2) ... ... - // ... - const bool kernel_width_is_odd = FILTER_SIZE_X % 2 == 1; - - float blockA00[FILTER_SIZE_X]; - float blockA01[FILTER_SIZE_X]; - - // in case the data is not aligned to sizeof(T)*FILTER_SIZE_X we need to use vload or set the data in a loop - { - unsigned i = 0; - LOOP(FILTER_SIZE_X, i, - { -#if LEFTOVERS == 1 - if(src0_read_offset0_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - { - if(src0_read_offset0 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - blockA00[i] = src0[src0_read_offset0 + i]; - } - else -#endif - blockA00[i] = src0[src0_read_offset0 + i]; - -#if LEFTOVERS == 1 - if(src0_read_offset1_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - { - if(src0_read_offset1 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - blockA01[i] = src0[src0_read_offset1 + i]; - } - else -#endif - blockA01[i] = src0[src0_read_offset1 + i]; - } ) - } - - float* pblockA00 = (float*)(&blockA00); - float* pblockA01 = (float*)(&blockA01); - - src0_read_offset0 += INPUT0_Y_PITCH; - src0_read_offset1 += INPUT0_Y_PITCH; - - - float blockB00[FILTER_SIZE_X*4]; - float8* p8BlockB00 = (float8*)blockB00; - float4* p4BlockB00 = (float4*)blockB00; - float* pBlockB00 = (float* )blockB00; - - interleaved_y = 0; - LOOP(FILTER_SIZE_X_DIV2, interleaved_y, - { - p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) ); - src1_read_offset += ALIGNED_OFM * 2; - } ) - if ( kernel_width_is_odd ) - { - p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) ); - src1_read_offset += ALIGNED_OFM * 2; - } - - // Perform MADs - kernel_idx = 0; - interleaved_y = 0; - LOOP(FILTER_SIZE_X_DIV2, interleaved_y, - { - kernel_y = interleaved_y * 2; - DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); 
kernel_idx++; - DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; - } ) - if ( kernel_width_is_odd ) - { - kernel_y = interleaved_y * 2; - DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - } - } - - //while( ++patch_row < 1 ); //debug - while( ++patch_row < FILTER_SIZE_Y ); - - src0_read_offset0 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch - src0_read_offset1 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch - } - //while ( ++patch_depth < 1 ); //debug - while ( ++patch_depth < INPUT0_FEATURE_NUM ); - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: - // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- __global float *out0 = dst + OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset - __global float *out1 = dst + OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); // x offset - - #if BIAS_TERM - __global float8* biasPtr = (__global float8*) (bias + group_x * TILE_N); - #endif - - uint out0_offset = OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset - - uint out1_offset = OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); - - //-----------------------------------------------------------------------------------------------// - // OUTPUT PHASE - //-----------------------------------------------------------------------------------------------// - if( global_y * TILE_M < OUTPUT_SIZE_X * OUTPUT_SIZE_Y ) - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - blockC20 += *(biasPtr + 2); - blockC30 += *(biasPtr + 3); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - blockC20 = ACTIVATION_CONV(blockC20, ACTIVATION_PARAMS_CONV); - blockC30 = ACTIVATION_CONV(blockC30, ACTIVATION_PARAMS_CONV); - - // eltwise - uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; - } - - blockC00 = ACTIVATION_ELTW(blockC00, ACTIVATION_PARAMS_ELTW); - blockC10 = ACTIVATION_ELTW(blockC10, ACTIVATION_PARAMS_ELTW); - blockC20 = ACTIVATION_ELTW(blockC20, ACTIVATION_PARAMS_ELTW); - blockC30 = ACTIVATION_ELTW(blockC30, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for( unsigned i = 0; i < 8; i++ ) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; - out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; - } - } - else - { - if ( ( global_x + 1 ) < get_global_size(0) ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - blockC20 += *(biasPtr + 2); - blockC30 += *(biasPtr + 3); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - blockC20 = ACTIVATION_CONV(blockC20, ACTIVATION_PARAMS_CONV); - blockC30 = ACTIVATION_CONV(blockC30, ACTIVATION_PARAMS_CONV); - - // eltwise - uint 
src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; - } - - blockC00 = ACTIVATION_ELTW(blockC00, ACTIVATION_PARAMS_ELTW); - blockC10 = ACTIVATION_ELTW(blockC10, ACTIVATION_PARAMS_ELTW); - blockC20 = ACTIVATION_ELTW(blockC20, ACTIVATION_PARAMS_ELTW); - blockC30 = ACTIVATION_ELTW(blockC30, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for ( unsigned i = 0; i < 8; i++ ) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; - out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; - } - } - else - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - blockC20 += *(biasPtr + 2); - if (( OUTPUT_FEATURE_NUM % TILE_N) > 24 ) blockC30 += *(biasPtr + 3); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - blockC20 = ACTIVATION_CONV(blockC20, ACTIVATION_PARAMS_CONV); - - // remaining output channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - blockC30[i] = ACTIVATION_CONV(blockC30[i], ACTIVATION_PARAMS_CONV); - } - - // eltwise - uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - } - - // remaining output channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - blockC30[i] += src3[src3_offset + (i + 24 )* INPUT1_FEATURE_PITCH]; - blockC30[i] = ACTIVATION_ELTW(blockC30[i], ACTIVATION_PARAMS_ELTW); - } - - blockC00 = ACTIVATION_ELTW(blockC00, ACTIVATION_PARAMS_ELTW); - blockC10 = ACTIVATION_ELTW(blockC10, ACTIVATION_PARAMS_ELTW); - blockC20 = ACTIVATION_ELTW(blockC20, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for (unsigned i = 0; i < 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; - } - - // remaining output channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - if (( OUTPUT_FEATURE_NUM % TILE_N) > 16 ) - blockC20 += *(biasPtr + 2); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC20[i], ACTIVATION_PARAMS_CONV); - - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - if (( OUTPUT_FEATURE_NUM % TILE_N) 
> 8 ) - blockC10 += *(biasPtr + 1); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC10[i], ACTIVATION_PARAMS_CONV); - } - } - else - { - #if BIAS_TERM - blockC00 += *biasPtr; - #endif - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC00[i], ACTIVATION_PARAMS_CONV); - } - } - } - } - } - - if ((global_y * TILE_M + 1) < OUTPUT_SIZE_X * OUTPUT_SIZE_Y ) - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - blockC21 += *(biasPtr + 2); - blockC31 += *(biasPtr + 3); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - blockC21 = ACTIVATION_CONV(blockC21, ACTIVATION_PARAMS_CONV); - blockC31 = ACTIVATION_CONV(blockC31, ACTIVATION_PARAMS_CONV); - - // eltwise - uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out1_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC01[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC11[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC21[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - blockC31[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; - } - - blockC01 = ACTIVATION_ELTW(blockC01, ACTIVATION_PARAMS_ELTW); - blockC11 = ACTIVATION_ELTW(blockC11, ACTIVATION_PARAMS_ELTW); - blockC21 = ACTIVATION_ELTW(blockC21, ACTIVATION_PARAMS_ELTW); - blockC31 = ACTIVATION_ELTW(blockC31, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for( unsigned i = 0; i < 8; i++ ) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; - out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i]; - } - } - else - { - if ( ( global_x + 1 ) < get_global_size(0) ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - blockC21 += *(biasPtr + 2); - blockC31 += *(biasPtr + 3); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - blockC21 = ACTIVATION_CONV(blockC21, ACTIVATION_PARAMS_CONV); - blockC31 = ACTIVATION_CONV(blockC31, ACTIVATION_PARAMS_CONV); - - for ( unsigned i = 0; i < 8; i++ ) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; - out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i]; - } - } - else - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - blockC21 += *(biasPtr + 2); - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 24 ) blockC31 += *(biasPtr + 3); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - blockC21 = ACTIVATION_CONV(blockC21, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; - } - - // Remaining channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 
8; i++) - { - out1[(24+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC31[i], ACTIVATION_PARAMS_CONV); - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 16 ) blockC21 += *(biasPtr + 2); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out1[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC21[i], ACTIVATION_PARAMS_CONV); - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 8 ) blockC11 += *(biasPtr + 1); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out1[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC11[i], ACTIVATION_PARAMS_CONV); - } - } - else - { - #if BIAS_TERM - blockC01 += *biasPtr; - #endif - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC01[i], ACTIVATION_PARAMS_CONV); - } - } - } - } - } -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl deleted file mode 100644 index e0eec62783e..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl +++ /dev/null @@ -1,510 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/mmad.cl" - -#define SUM_SCALE 0.11f -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - float4 tmp;\ - for(uint z = 0; z < 4; z++)\ - {\ - tmp.s0 = (float)eltw_input_vals[z * 4 + 0] * SUM_SCALE + bias_f.s0;\ - tmp.s1 = (float)eltw_input_vals[z * 4 + 1] * SUM_SCALE + bias_f.s1;\ - tmp.s2 = (float)eltw_input_vals[z * 4 + 2] * SUM_SCALE + bias_f.s2;\ - tmp.s3 = (float)eltw_input_vals[z * 4 + 3] * SUM_SCALE + bias_f.s3;\ - \ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + tmp.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + tmp.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + tmp.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + tmp.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = regC[0 * 4 + i][idx];\ - regC_uchar16.s1 = regC[1 * 4 + i][idx];\ - regC_uchar16.s2 = regC[2 * 4 + i][idx];\ - regC_uchar16.s3 = regC[3 * 4 + i][idx];\ - \ - regC_uchar16.s4 = regC[0 * 4 + i][idx+1];\ - regC_uchar16.s5 = regC[1 * 4 + i][idx+1];\ - regC_uchar16.s6 = regC[2 * 4 + i][idx+1];\ - regC_uchar16.s7 = regC[3 * 4 + i][idx+1];\ - \ - regC_uchar16.s8 = regC[0 * 4 + i][idx+2];\ - regC_uchar16.s9 = regC[1 * 4 + i][idx+2];\ - regC_uchar16.sa = regC[2 * 4 + i][idx+2];\ - regC_uchar16.sb = regC[3 * 4 + i][idx+2];\ - \ - regC_uchar16.sc = regC[0 * 4 + i][idx+3];\ - regC_uchar16.sd = regC[1 * 4 + i][idx+3];\ - regC_uchar16.se = regC[2 * 4 + i][idx+3];\ - regC_uchar16.sf = regC[3 * 4 + i][idx+3];\ - {\ - int16 sum;\ - for(uint s = 0; s <16; s++)\ - {\ - sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ - }\ - regC_uchar16.s0 = convert_uchar_sat( sum.s0 );\ - regC_uchar16.s1 = convert_uchar_sat( sum.s1 );\ - regC_uchar16.s2 = convert_uchar_sat( sum.s2 );\ - regC_uchar16.s3 = convert_uchar_sat( sum.s3 );\ - \ - regC_uchar16.s4 = convert_uchar_sat( sum.s4 );\ - regC_uchar16.s5 = convert_uchar_sat( sum.s5 );\ - regC_uchar16.s6 = convert_uchar_sat( sum.s6 );\ - regC_uchar16.s7 = convert_uchar_sat( sum.s7 );\ - \ - regC_uchar16.s8 = convert_uchar_sat( sum.s8 );\ - regC_uchar16.s9 = convert_uchar_sat( sum.s9 );\ - regC_uchar16.sa = convert_uchar_sat( sum.sa );\ - regC_uchar16.sb = convert_uchar_sat( sum.sb );\ - \ - regC_uchar16.sc = convert_uchar_sat( sum.sc );\ - regC_uchar16.sd = convert_uchar_sat( sum.sd );\ - regC_uchar16.se = convert_uchar_sat( sum.se );\ - regC_uchar16.sf = convert_uchar_sat( sum.sf );\ - } - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s1 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s2 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s3 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s5 = 
as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.s6 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.s7 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\
-    \
-    regC_uchar16.s8 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.s9 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sa = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sb = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\
-    \
-    regC_uchar16.sc = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sd = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.se = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sf = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\
-    {\
-        int16 sum;\
-        for(uint s = 0; s < 16; s++)\
-        {\
-            sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\
-        }\
-        regC_uchar16.s0 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s0) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s1 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s1) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s2 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s2) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s3 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s3) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\
-        \
-        regC_uchar16.s4 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s4) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s5 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s5) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s6 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s6) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s7 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s7) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\
-        \
-        regC_uchar16.s8 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s8) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s9 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s9) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.sa = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sa) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.sb = 
as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sb) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sc) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sd = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sd) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.se = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.se) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sf = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sf) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - } -#endif - - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -#if IN_OUT_OPT != 1 -inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset_account_padding)(uint cOffset, uint strideX, uint strideY) -{ -#if ELTW_WITH_PADDING == 1 || ELTW_STRIDE_X != 1 || ELTW_STRIDE_Y != 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - x_idx *= strideX; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - y_idx *= strideY; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * IN2_F_BLOCK_PITCH; - padded_offset += b_slice_idx * IN2_B_BLOCK_PITCH; - padded_offset += y_idx * IN2_Y_PITCH; - padded_offset += x_idx * IN2_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += IN2_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} -#endif - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - 
__attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8_fused_eltwise) - ( - __global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx, - __global char* const input2, - __global float* eltw_calibrations - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // Each work-group works to compute 128x128 tile. - // Each work-group contains 16 sub-groups. - // Each sub-group within the work-group works to compute a 32x32 tile. - // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). - // 2) Each sub-group works to compute 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. 
-    // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
-    __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024
-    __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024
-
-    __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA;
-    __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA;
-    __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB;
-
-    const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y);
-
-    const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint);
-    const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8);
-    const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4);
-    const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4);
-
-    // Thread IDs
-    const uint g_tidY = get_global_id(DIM_Y); // 0,...,get_global_size(DIM_Y)-1
-    const uint g_tidX = get_global_id(DIM_X); // 0,...,get_global_size(DIM_X)-1
-    const uint l_tidX = get_local_id(DIM_X); // 0,...,31 in WG
-    const uint l_tidY = get_local_id(DIM_Y); // 0,1,2,3 in WG
-    const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; // 0,1,2,...,127
-
-    // SubGroup IDs
-    const uint sg_tid = get_sub_group_local_id(); // 0,...,7 (SG_SIZE - 1)
-    const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); // g_tidX / 8
-    const uint sg_global_idY = g_tidY;
-
-    const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); // l_tidX / 8 = 0,...,3
-    const uint sg_local_idY = l_tidY; // 0,1,2,3
-    const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; // get_local_size(DIM_X) / SG_SIZE = 32/8 = 4
-
-    const uint sub_group_id = get_sub_group_id();
-
-    // Registers
-    int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item is responsible for a 32x4 chunk of int elements // (32/8)*4
-    int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
-    int8 colB[2]; // each lane will store 32x4 piece of matrixB
-
-    // SLM indices
-    const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY;
-    const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8);
-    const uint numElements32x8TileB = numElements32x32TileB / 4;
-    const uint l_offsetTileB = numElements32x32TileB * sg_local_idX;
-    const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid;
-    const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid;
-    const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid;
-    const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid;
-
-    // Global indices
-    uint g_idxA[2];
-    uint g_idxB[2];
-#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB)
-    g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid;
-    g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid;
-    g_idxA[1] = g_idxA[0] + l_groupSize;
-    g_idxB[1] = g_idxB[0] + l_groupSize;
-#else // Row (matrixA) and Col (matrixB) major layout
-    g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) +
-                (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
-    g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) +
-                (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
-    g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K /
sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - /* - * SLM setup - HDC read only - */ - // Overlap HDC reads with mmad compute - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - /* - * mmad compute - */ - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - /* - * SLM setup - SLM write only - */ - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - /* - * Last mmad compute iteration (avoids branching in main loop) - */ - - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); 
- float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - - // eltwise calibs - float4 eltw_calib_f = vload4(0, eltw_calibrations + feature); - - uchar16 eltw[(2*SG_TILE_M) / (sizeof(int8) / sizeof(int))]; - uint tmpcOff = cOffset; - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) - for (uint i = 0; i < (2*SG_TILE_M) / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(tmpcOff); -#if IN_OUT_OPT == 1 - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(g_outC_uchar + padded_offset))); -#else - const uint eltw_second_input_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset_account_padding)(tmpcOff, ELTW_STRIDE_X, ELTW_STRIDE_Y); - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(input2 + eltw_second_input_offset))); -#endif - tmpcOff += sizeof(uchar16) * SG_SIZE; - } - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2]; - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2 + 1]; - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif -} - -#undef SUM_SCALE -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl deleted file mode 100644 index 30542a197ca..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl +++ /dev/null @@ -1,505 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
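[Editor's note on the quantization path shared by this kernel and the 224x128 variant below: stripped of the 16-lane macro unrolling, the default QUANTIZATION branch reduces to the following scalar sketch. It is an illustrative model only, written in plain C with a hypothetical helper name; the ACTIVATION callbacks applied between the two stages are omitted, and the non-saturating stage-1 conversion mirrors the original's convert_char.]

    #include <math.h>

    /* Scalar model of the default QUANTIZATION path: stage 1 scales the int32
     * convolution accumulator by per-channel quant/bias/calibration factors
     * (convert_char, no saturation, in the original); stage 2 adds the int8
     * eltwise operand and requantizes with saturation (convert_char_sat). */
    static unsigned char requantize_one(int conv_acc, float quant, float i_qf,
                                        float bias, float calib,
                                        signed char eltw_in, float eltw_calib)
    {
        signed char conv_q = (signed char)roundf(
            ((float)conv_acc * quant * i_qf + bias) * calib);

        int sum = (int)conv_q + (int)eltw_in;      /* integer eltwise add */
        int out = (int)roundf((float)sum * eltw_calib);
        if (out < -128) out = -128;                /* saturate to int8 range */
        if (out > 127)  out = 127;
        return (unsigned char)out;  /* reinterpreted as uchar for the block write */
    }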
- -#include "include/mmad.cl" - -#define SUM_SCALE 0.11f -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - float4 tmp;\ - for(uint z = 0; z < 4; z++)\ - {\ - tmp.s0 = (float)eltw_input_vals[z * 4 + 0] * SUM_SCALE + bias_f.s0;\ - tmp.s1 = (float)eltw_input_vals[z * 4 + 1] * SUM_SCALE + bias_f.s1;\ - tmp.s2 = (float)eltw_input_vals[z * 4 + 2] * SUM_SCALE + bias_f.s2;\ - tmp.s3 = (float)eltw_input_vals[z * 4 + 3] * SUM_SCALE + bias_f.s3;\ - \ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + tmp.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + tmp.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + tmp.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + tmp.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = regC[0 * 4 + i][idx];\ - regC_uchar16.s1 = regC[1 * 4 + i][idx];\ - regC_uchar16.s2 = regC[2 * 4 + i][idx];\ - regC_uchar16.s3 = regC[3 * 4 + i][idx];\ - \ - regC_uchar16.s4 = regC[0 * 4 + i][idx+1];\ - regC_uchar16.s5 = regC[1 * 4 + i][idx+1];\ - regC_uchar16.s6 = regC[2 * 4 + i][idx+1];\ - regC_uchar16.s7 = regC[3 * 4 + i][idx+1];\ - \ - regC_uchar16.s8 = regC[0 * 4 + i][idx+2];\ - regC_uchar16.s9 = regC[1 * 4 + i][idx+2];\ - regC_uchar16.sa = regC[2 * 4 + i][idx+2];\ - regC_uchar16.sb = regC[3 * 4 + i][idx+2];\ - \ - regC_uchar16.sc = regC[0 * 4 + i][idx+3];\ - regC_uchar16.sd = regC[1 * 4 + i][idx+3];\ - regC_uchar16.se = regC[2 * 4 + i][idx+3];\ - regC_uchar16.sf = regC[3 * 4 + i][idx+3];\ - {\ - int16 sum;\ - for(uint s = 0; s <16; s++)\ - {\ - sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ - }\ - regC_uchar16.s0 = convert_uchar_sat( sum.s0 );\ - regC_uchar16.s1 = convert_uchar_sat( sum.s1 );\ - regC_uchar16.s2 = convert_uchar_sat( sum.s2 );\ - regC_uchar16.s3 = convert_uchar_sat( sum.s3 );\ - \ - regC_uchar16.s4 = convert_uchar_sat( sum.s4 );\ - regC_uchar16.s5 = convert_uchar_sat( sum.s5 );\ - regC_uchar16.s6 = convert_uchar_sat( sum.s6 );\ - regC_uchar16.s7 = convert_uchar_sat( sum.s7 );\ - \ - regC_uchar16.s8 = convert_uchar_sat( sum.s8 );\ - regC_uchar16.s9 = convert_uchar_sat( sum.s9 );\ - regC_uchar16.sa = convert_uchar_sat( sum.sa );\ - regC_uchar16.sb = convert_uchar_sat( sum.sb );\ - \ - regC_uchar16.sc = convert_uchar_sat( sum.sc );\ - regC_uchar16.sd = convert_uchar_sat( sum.sd );\ - regC_uchar16.se = convert_uchar_sat( sum.se );\ - regC_uchar16.sf = convert_uchar_sat( sum.sf );\ - } - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( 
(float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - {\ - int16 sum;\ - for(uint s = 0; s <16; s++)\ - {\ - sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ - }\ - regC_uchar16.s0 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s0) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s1 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s1) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s2 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s2) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s3 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s3) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s4) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s5 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s5) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s6 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s6) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s7 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s7) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s8) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s9 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s9) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sa = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sa) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sb = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sb) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.sc = 
as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sc) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sd = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sd) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.se = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.se) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sf = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sf) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - } -#endif - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -#if IN_OUT_OPT != 1 -inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset_account_padding)(uint cOffset, uint strideX, uint strideY) -{ -#if ELTW_WITH_PADDING == 1 || ELTW_STRIDE_X != 1 || ELTW_STRIDE_Y != 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - x_idx *= strideX; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - y_idx *= strideY; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * IN2_F_BLOCK_PITCH; - padded_offset += b_slice_idx * IN2_B_BLOCK_PITCH; - padded_offset += y_idx * IN2_Y_PITCH; - padded_offset += x_idx * IN2_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += IN2_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} -#endif - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = 
MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] );
-    }
-    colB[1] = l_tileB[l_offsetTileB_col3];
-    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
-    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
-    {
-        // Compute partial C
-        regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]);
-    }
-    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
-    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
-    {
-        // Compute partial C
-        regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]);
-    }
-}
-
-/*
- * \brief GEMM kernel to compute MxN matrix using SLM
- * \param g_inA - Input matrix
- * \param g_inB - Input matrix
- * \param g_outC - Output matrix
- */
-
-__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
-KERNEL(Kernel_GEMM_MMAD8_32x32SG_224x128WG_SLM_INT8_fused_eltwise)
-    (__global char* const g_inA,
-    __global int* g_outC,
-    __global char* const g_inB,
-    #if BIAS_TERM
-    __global BIAS_TYPE* biases,
-    #endif
-    __global float* quantizations,
-    #if CALIBRATION_TERM
-    __global float* calibrations,
-    #endif
-    uint split_idx,
-    __global char* const input2,
-    __global float* eltw_calibrations
-    )
-{
-
-    __global int4* const g_matrixA = (__global int4*)g_inA;
-    __global int4* const g_matrixB = (__global int4*)g_inB;
-    __global int8* g_matrixC = (__global int8*)g_outC;
-
-    // Each work-group works to compute a 224x128 (WG_TILE_M x WG_TILE_N) tile.
-    // Each work-group contains 28 sub-groups.
-    // Each sub-group within the work-group works to compute a 32x32 tile.
-    // 1) All work-items in WG fill SLM with tileA (224x32) and tileB (32x128).
-    // 2) Each sub-group works to compute 32x32 tileC (stored in regC).
-    // Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
-    // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
-    __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
-    __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
-
-    __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA;
-    __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA;
-    __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB;
-
-    const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y);
-
-    const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint);
-    const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8);
-    const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4);
-    const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4);
-
-    // Thread IDs
-    const uint g_tidY = get_global_id(DIM_Y);
-    const uint g_tidX = get_global_id(DIM_X);
-    const uint l_tidX = get_local_id(DIM_X);
-    const uint l_tidY = get_local_id(DIM_Y);
-    const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX;
-
-    // SubGroup IDs
-    const uint sg_tid = get_sub_group_local_id();
-    const uint sg_global_idX = (uint)(g_tidX / SG_SIZE);
-    const uint sg_global_idY = g_tidY;
-    const uint sg_local_idX = (uint)(l_tidX / SG_SIZE);
-    const uint sg_local_idY = l_tidY;
-    const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX;
-
-    const uint sub_group_id = get_sub_group_id();
-
-    // Registers
-    int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item is responsible for a 32x4 chunk of int elements
-    int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
-    int8 colB[2]; // each lane will
store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices - uint g_idxA[2]; - uint g_idxB[2]; -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; - g_idxA[1] = g_idxA[0] + l_groupSize; - g_idxB[1] = g_idxB[0] + l_groupSize; -#else // Row (matrixA) and Col (matrixB) major layout - g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - - //MMAD compute - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - 
l_offsetTileB_col3, rowA, colB, regC); - - //SLM setup - SLM write only - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - //Last MMAD compute iteration (avoids branching in main loop) - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); - float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - - // eltwise calibs - float4 eltw_calib_f = vload4(0, eltw_calibrations + feature); - - uchar16 eltw[(2*SG_TILE_M) / (sizeof(int8) / sizeof(int))]; - uint tmpcOff = cOffset; - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) - for (uint i = 0; i < (2*SG_TILE_M) / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(tmpcOff); -#if IN_OUT_OPT == 1 - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(g_outC_uchar + padded_offset))); -#else - const uint eltw_second_input_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset_account_padding)(tmpcOff, ELTW_STRIDE_X, ELTW_STRIDE_Y); - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(input2 + eltw_second_input_offset))); -#endif - tmpcOff += sizeof(uchar16) * SG_SIZE; - } - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2]; - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2 + 1]; - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } -#else - // Write final accumulated values - uint 
cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif -} - -#undef SUM_SCALE -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl index 6bef9de4977..0fd1fb9eecc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl @@ -55,12 +55,6 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)( #if WITH_BIAS const __global half *bias, #endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif #if HAS_FUSED_OPS_DECLS FUSED_OPS_DECLS, #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl index 024c3b818d5..b3f717d0461 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl @@ -53,12 +53,6 @@ KERNEL(gen9_common_conv_fwd_f32_kernel)( #if WITH_BIAS const __global float *bias, #endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif #if HAS_FUSED_OPS_DECLS FUSED_OPS_DECLS, #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl index 9b1fc3d3805..1fe945327b3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl @@ -26,11 +26,7 @@ KERNEL(eltwise)( INPUTS_DECLS - __global OUTPUT_TYPE* output -#if CALIBRATION_TERM - , const __global float* calibrations -#endif - ) + __global OUTPUT_TYPE* output) { #if OUTPUT_DIMS == 6 // 4D spatial diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl deleted file mode 100644 index 33d340337fc..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - - -KERNEL(index_select_gpu_ref)( - const __global UNIT_TYPE* input, -#ifndef REVERSE - const __global int* indices, -#endif - __global UNIT_TYPE* output) -{ - // [CONSTEXPR]: - const uint input_sx = INPUT0_SIZE_X; - const uint input_sy = INPUT0_SIZE_Y; - const uint input_sf = INPUT0_FEATURE_NUM; - const uint input_sb = INPUT0_BATCH_NUM; - - const uint out_b = (uint) get_global_id(0); - const uint indices_idx = (uint) get_global_id(1); - const uint feature_idx = (uint) get_global_id(2); - - #if AXES_NUMBER == 1 - #ifdef REVERSE - const uint indices_value = REVERSE_AXIS_SIZE - 1 - indices_idx; - #else - const uint indices_value = indices[indices_idx]; - #endif - #elif AXES_NUMBER > 1 - #ifdef REVERSE - uint indices_value[4] = { - #ifdef REVERSE_INDEX_SELECT_AXIS_BATCH_SIZE - REVERSE_INDEX_SELECT_AXIS_BATCH_SIZE - 1 - out_b, - #else - out_b, - #endif - #ifdef REVERSE_INDEX_SELECT_AXIS_FEATURE_SIZE - REVERSE_INDEX_SELECT_AXIS_FEATURE_SIZE - 1 - feature_idx, - #else - feature_idx, - #endif - #ifdef REVERSE_INDEX_SELECT_AXIS_Y_SIZE - REVERSE_INDEX_SELECT_AXIS_Y_SIZE - 1 - indices_idx, - #else - indices_idx, - #endif - 0 - }; - #endif - #endif - - // [LOGIC]: - #if AXES_NUMBER > 1 - for(uint x = 0; x < input_sx; x++) - { - #ifdef REVERSE_INDEX_SELECT_AXIS_X_SIZE - indices_value[3] = REVERSE_INDEX_SELECT_AXIS_X_SIZE - 1 - x; - #else - indices_value[3] = x; - #endif - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, indices_idx, x)] = input[GET_DATA_INDEX(INPUT0, indices_value[0], indices_value[1], indices_value[2], indices_value[3])]; - } - - #else - #ifdef INDEX_SELECT_AXIS_BATCH - for(uint x = 0; x < input_sx; x++) - { - for(uint y = 0; y < input_sy; y++) - { - output[GET_DATA_INDEX(OUTPUT, indices_idx, feature_idx, y, x)] = input[GET_DATA_INDEX(INPUT0, indices_value, feature_idx, y, x)]; - } - } - #elif defined INDEX_SELECT_AXIS_FEATURE - for(uint x = 0; x < input_sx; x++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, indices_idx, feature_idx, x)] = input[GET_DATA_INDEX(INPUT0, out_b, indices_value, feature_idx, x)]; - } - #elif defined INDEX_SELECT_AXIS_X - for(uint i = 0; i < input_sy; i++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, i, indices_idx)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, i, indices_value)]; - } - #elif defined INDEX_SELECT_AXIS_Y - - for(uint i = 0; i < input_sx; i++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, indices_idx, i)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, indices_value, i)]; - } - #endif - #endif -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl deleted file mode 100644 index 99549fc9cdc..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifdef BATCH_AXIS - #define GAP_SIZE (INPUT0_FEATURE_NUM * INPUT0_SIZE_X * INPUT0_SIZE_Y) - #define VALUES_NUM INPUT0_BATCH_NUM - #define FIRST_DIM_SIZE INPUT0_SIZE_X - #define SECOND_DIM_SIZE INPUT0_SIZE_Y - #define FIRST_DIM_MUL 1 - #define SECOND_DIM_MUL INPUT0_SIZE_X - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y) -#endif -#ifdef FEATURE_AXIS - #define GAP_SIZE (INPUT0_SIZE_X * INPUT0_SIZE_Y) - #define VALUES_NUM INPUT0_FEATURE_NUM - #define FIRST_DIM_SIZE INPUT0_SIZE_X - #define SECOND_DIM_SIZE INPUT0_SIZE_Y - #define FIRST_DIM_MUL 1 - #define SECOND_DIM_MUL INPUT0_SIZE_X - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM) -#endif -#ifdef Y_AXIS - #define GAP_SIZE INPUT0_SIZE_X - #define VALUES_NUM INPUT0_SIZE_Y - #define FIRST_DIM_SIZE INPUT0_SIZE_X - #define SECOND_DIM_SIZE INPUT0_FEATURE_NUM - #define FIRST_DIM_MUL 1 - #define SECOND_DIM_MUL (INPUT0_SIZE_Y * INPUT0_SIZE_X) - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM) -#endif -#ifdef X_AXIS - #define GAP_SIZE 1 - #define VALUES_NUM INPUT0_SIZE_X - #define FIRST_DIM_SIZE INPUT0_SIZE_Y - #define SECOND_DIM_SIZE INPUT0_FEATURE_NUM - #define FIRST_DIM_MUL INPUT0_SIZE_X - #define SECOND_DIM_MUL (INPUT0_SIZE_Y * INPUT0_SIZE_X) - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM) -#endif - - -#include "include/common.cl" -#include "include/data_types.cl" - -KERNEL(lookup_table_axis)(const __global UNIT_TYPE* input0, const __global float* indices, __global UNIT_TYPE* output) -{ - const uint first_dim_id = (uint)get_global_id(0); - const uint second_dim_id = (uint)get_global_id(1); - const uint third_dim_id = (uint)get_global_id(2); - const uint offset = first_dim_id * FIRST_DIM_MUL + second_dim_id * SECOND_DIM_MUL + third_dim_id * THIRD_DIM_MUL; - const uint val_index = (first_dim_id + second_dim_id * FIRST_DIM_SIZE + third_dim_id * FIRST_DIM_SIZE * SECOND_DIM_SIZE) * VAL_NUM; - for (uint i = 0; i < VAL_NUM; i++) - { - uint global_index = offset + (int)indices[val_index + i] * GAP_SIZE; - output[val_index + i] = input0[global_index]; - } -} - - -#undef GAP_SIZE -#undef VALUES_NUM -#undef FIRST_DIM_SIZE -#undef SECOND_DIM_SIZE -#undef FIRST_DIM_MUL -#undef SECOND_DIM_MUL -#undef THIRD_DIM_MUL \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl deleted file mode 100644 index a8e25fd2004..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/common.cl" -#include "include/data_types.cl" - -KERNEL(lookup_table)(const __global UNIT_TYPE* input0, const __global float* indices, __global UNIT_TYPE* output) -{ - const uint x = (uint)get_global_id(0); - const uint b = (uint)get_global_id(1); - const uint size = INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM; - #ifdef INPUT0_LAYOUT_BFYX - const uint global_index = b * VAL_NUM + x; - output[global_index] = input0[(int)indices[global_index] + b*size]; - #elif defined INPUT0_LAYOUT_YXFB - const uint global_index = b + x * INPUT0_BATCH_NUM; - output[global_index] = input0[(int)indices[global_index]*INPUT0_BATCH_NUM + b]; - #endif -} - \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index 4439732718c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
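[Editor's note on the pooling kernel below: it processes 4 batches x 4 features per work-item in the fs_bs_yx_bsv4_fsv32 layout, and the only subtle part is the averaging divider, which depends on DYNAMIC_KERNEL_DIVIDER. A scalar, single-feature sketch of that logic follows; all names are illustrative, not code from the deleted source.]

    /* Scalar sketch of the average-pooling divider selection.
     * dynamic_divider mirrors DYNAMIC_KERNEL_DIVIDER: divide by the number of
     * window elements that actually fall inside the input, not the window size. */
    float avg_pool_at(const float* in, int width, int height,
                      int off_x, int off_y, int pool_w, int pool_h,
                      int dynamic_divider)
    {
        float sum = 0.0f;
        int count = 0;
        for (int j = 0; j < pool_h; ++j) {
            for (int i = 0; i < pool_w; ++i) {
                int x = off_x + i;
                int y = off_y + j;
                if (x >= 0 && x < width && y >= 0 && y < height) {
                    sum += in[y * width + x];  /* padding positions contribute nothing */
                    ++count;
                }
            }
        }
        int divider = dynamic_divider ? (count > 0 ? count : 1)
                                      : pool_w * pool_h;
        return sum / (float)divider;
    }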
-
-
-#include "include/include_all.cl"
-
-#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
-#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
-
-#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
-
-#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE,4)
-#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
-
-#if MAX_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_MIN
-#elif AVG_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_ZERO
-#else
-    #error
-#endif
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
-    return ACCUMULATOR_MAX_FUNC(tmp, in);
-#elif AVG_POOLING
-    return tmp + in;
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(8)))
-KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
-    const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
-    , FUSED_OPS_DECLS
-#endif
-)
-{
-    const uint x = (uint)get_global_id(0);
-    const uint y = (uint)get_global_id(1);
-    const uint bf = (uint)get_global_id(2);
-    // we process 4 features per workitem that's why we need to divide it
-    const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
-    const uint f = ((uint)get_global_id(2) * 4) % aligned32_features;
-    const uint b = 4 * (((uint)get_global_id(2) * 4) / aligned32_features);
-    if (x >= OUTPUT_SIZE_X)
-    {
-        return;
-    }
-
-    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-    ACCUMULATOR_VEC4 result[4] = { INIT_VAL };
-
-#ifdef CHECK_BOUNDRY
-    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
-        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
-    {
-        return;
-    }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-    uint num_elementes = 0;
-#endif
-
-    const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, 0, 0);
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        int input_offset_y = offset_y + j;
-        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-        if(!zero_y)
-        {
-            for(uint i = 0; i < POOL_SIZE_X; i++)
-            {
-                int input_offset_x = offset_x + i;
-                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-                if(!zero)
-                {
-                    const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
-
-                    int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
-                    for(uint b = 0; b < 4; b++)
-                    {
-                        char4 input_data = as_char4(int_data[b]);
-                        result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0]));
-                        result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1]));
-                        result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2]));
-                        result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3]));
-                    }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-                    num_elementes++;
-#endif
-                }
-            }
-        }
-    }
-#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
-    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
-    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
-    const uint num_elementes = (hend - offset_y) * (wend - offset_x);
-#endif
-#else
-    uint input_idx = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, offset_y, offset_x);
-
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        for(uint i = 0; i < POOL_SIZE_X; i++)
-        {
-            int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
-            for(uint b = 0; b < 4; b++)
-            {
-                char4 input_data = as_char4(int_data[b]);
-                result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0]));
-                result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1]));
-                result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2]));
-                result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3]));
-            }
-
-            input_idx += IN_X_PITCH;
-        }
-        input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
-    }
-
-#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-    const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
-#endif
-#endif
-
-#if defined AVG_POOLING
-    #if ENABLE_ROUND
-        #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE(round(((float)result[b][i] / max(num_elementes, (uint)1))));
-                }
-            }
-        #else
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
-                }
-            }
-        #endif
-    #else
-        #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE(((float)result[b][i] / max(num_elementes, (uint)1)));
-                }
-            }
-        #else
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X));
-                }
-            }
-        #endif
-    #endif // ENABLE_ROUND
-#endif // AVG_POOLING
-
-#if OUTPUT_TYPE_SIZE == 1
-    int4 final_result;
-
-    for(uint bi = 0; bi < 4; bi++)
-    {
-        #if HAS_FUSED_OPS
-            ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(convert_char4(result[bi]));
-            FUSED_OPS;
-            final_result[bi] = as_int(FUSED_OPS_RESULT);
-        #else
-            char4 char_result = ACTIVATION(convert_char4(result[bi]), ACTIVATION_PARAMS);
-            final_result[bi] = as_int(char_result);
-        #endif
-    }
-    const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
-    intel_sub_group_block_write4((__global uint*)(output + output_pos), as_uint4(final_result));
-
-#elif OUTPUT_TYPE_SIZE == 2 || OUTPUT_TYPE_SIZE == 4
-    OUTPUT_VEC4 final_result;
-
-    for(uint bi = 0; bi < 4; bi++)
-    {
-        #if HAS_FUSED_OPS
-            ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result[bi]));
-            FUSED_OPS;
-            final_result = FUSED_OPS_RESULT;
-        #else
-            char4 char_result = ACTIVATION(TO_OUTPUT_VEC4(result[bi]), ACTIVATION_PARAMS);
-            final_result = TO_OUTPUT_VEC4(char_result);
-        #endif
-        const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b + bi, f, y, x);
-        vstore4(final_result, 0, output + output_pos);
-    }
-#endif
-}
-
-#undef INIT_VAL
-#undef ACCUMULATOR_VEC4
-#undef ACCUMULATOR_VEC4
-
-#undef ACTIVATION_VEC4
-#undef TO_ACTIVATION_VEC4
-
-#undef OUTPUT_VEC4
-#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl
deleted file mode 100644
index f439e9e6e30..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
-#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
-
-#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
-
-#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
-#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
-
-#if MAX_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_MIN
-#else
-    #error
-#endif
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
-    return ACCUMULATOR_MAX_FUNC(tmp, in);
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(32)))
-KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)(
-    const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
-    , FUSED_OPS_DECLS
-#endif
-)
-{
-    const uint x = (uint)get_group_id(0);
-    const uint y = (uint)get_group_id(1);
-    const uint bf = (uint)get_group_id(2) * BATCH_SG_COUNT + (uint)get_sub_group_id();
-    // we process 4 features per workitem that's why we need to divide it
-    const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
-    const uint f = ((bf * 32) % aligned32_features) + (get_sub_group_local_id() % 8) * 4;
-    const uint b = 4 * ((bf * 32) / aligned32_features) + (get_sub_group_local_id() / 8);
-    if (x >= OUTPUT_SIZE_X)
-    {
-        return;
-    }
-
-    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    ACCUMULATOR_VEC4 result = INIT_VAL;
-
-    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
-        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
-    {
-        return;
-    }
-
-    const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, 0, 0);
-    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        int input_offset_y = offset_y + j;
-
-        __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
-        for(uint i = 0; i < POOL_SIZE_X; i++)
-        {
-            int input_offset_x = offset_x + i;
-            bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-            bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-            bool zero = (zero_x || zero_y);
-            const uint input_idx = zero ? 0 : batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
-
-            const __global uint* input_uint = (const __global uint*)(input + input_idx);
-            int int_data = as_int(input_uint[0]);
-
-            char4 input_data = zero ? (char4)(INIT_VAL,INIT_VAL,INIT_VAL,INIT_VAL) : as_char4(int_data);
-            result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
-            result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
-            result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
-            result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
-        }
-    }
-
-    OUTPUT_VEC4 final_result;
-
-    #if HAS_FUSED_OPS
-        ACTIVATION_VEC4 pool_result;
-        pool_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result));
-        FUSED_OPS;
-        final_result = FUSED_OPS_RESULT;
-    #else
-        char4 pool_result;
-        for(uint op = 0; op < 4; op++)
-        {
-            pool_result[op] = ACTIVATION(TO_OUTPUT_TYPE(result[op]), ACTIVATION_PARAMS);
-        }
-        final_result = TO_OUTPUT_VEC4(pool_result);
-    #endif
-
-    const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
-    *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;
-}
-
-#undef INIT_VAL
-#undef ACCUMULATOR_VEC4
-
-#undef ACTIVATION_VEC4
-#undef TO_ACTIVATION_VEC4
-
-#undef OUTPUT_VEC4
-#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl
deleted file mode 100644
index fde8e973320..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-#define LOCAL_SIZE INPUT0_BATCH_NUM
-
-KERNEL(scale_grad_weights_gpu_ref)(
-    const __global UNIT_TYPE* input,
-    const __global UNIT_TYPE* input_grad,
-    __global OUTPUT_TYPE* output,
-    __global float* scale,
-#if BIAS_TERM
-    __global float* bias,
-#endif
-#if MOMENTUM
-    __global float* prev_grad_w,
-#if BIAS_TERM
-    __global float* prev_grad_b,
-#endif
-#endif
-    const float lr
-    )
-{
-    __local ACCUMULATOR_TYPE grad_sum[LOCAL_SIZE];
-    __local ACCUMULATOR_TYPE grad_sum_in[LOCAL_SIZE];
-
-    const uint local_idx = (uint)get_local_id(0);
-    const uint f = (uint)get_global_id(1);
-
-    grad_sum[local_idx] = 0;
-    grad_sum_in[local_idx] = 0;
-
-    uint grad_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0);
-    for (uint y = 0; y < INPUT0_SIZE_Y; y++)
-    {
-        for (uint x = 0; x < INPUT0_SIZE_X; x++)
-        {
-            ACCUMULATOR_TYPE in_g = TO_ACCUMULATOR_TYPE(input_grad[grad_idx]);
-            grad_sum[local_idx] += in_g * lr;
-            grad_sum_in[local_idx] += in_g * TO_ACCUMULATOR_TYPE(input[grad_idx]) * lr;
-            grad_idx += INPUT0_X_PITCH;
-        }
-        grad_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
-    {
-        if (local_idx < offset)
-        {
-            grad_sum[local_idx] += grad_sum[local_idx + offset];
-            grad_sum_in[local_idx] += grad_sum_in[local_idx + offset];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (local_idx == 0)
-    {
-#if MOMENTUM
-        ACCUMULATOR_TYPE update_gradient_w = grad_sum_in[0] + prev_grad_w[f] * MOMENTUM_FACTOR + DECAY_RATE * lr * scale[f];
-        scale[f] -= update_gradient_w;
-        prev_grad_w[f] = update_gradient_w;
-#else
-        scale[f] -= grad_sum_in[0] + DECAY_RATE * lr * scale[f];
-#endif
-
-#if BIAS_TERM
-#if MOMENTUM
-        ACCUMULATOR_TYPE update_gradient_b = prev_grad_b[f] * MOMENTUM_FACTOR + grad_sum[0];
-        bias[f] -= update_gradient_b;
-        prev_grad_b[f] = update_gradient_b;
-#else
-        bias[f] -= grad_sum[0];
-#endif
-#endif
-    }
-}
-
-#undef LOCAL_SIZE
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl
deleted file mode 100644
index b3f09cc27b8..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-KERNEL(softmax_loss_grad_gpu_ref)(
-    const __global INPUT0_TYPE* input_pred,
-    __global OUTPUT_TYPE* output,
-    const __global INPUT1_TYPE* labels
-    )
-{
-    const uint b_x = get_global_id(0);
-    const uint batch_id = b_x / OUTPUT_SIZE_X;
-    const uint x = b_x % OUTPUT_SIZE_X;
-
-    const uint input_pred_idx = GET_DATA_INDEX(INPUT0, batch_id, 0, 0, x);
-    const uint labels_idx = GET_DATA_INDEX(INPUT1, batch_id, 0, 0, 0);
-
-    UNIT_TYPE label = labels[labels_idx];
-    const uint output_idx = GET_DATA_INDEX(OUTPUT, batch_id, 0, 0, x);
-
-    if(label == x)
-        output[output_idx] = input_pred[input_pred_idx] - 1;
-    else
-        output[output_idx] = input_pred[input_pred_idx];
-}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
index fb84aa9c33d..37a9258c658 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
@@ -112,8 +112,6 @@ std::string common_kernel_base::CreateJit(const std::string& template_name,
 Arguments common_kernel_base::GetArgsDesc(uint32_t num_of_input,
                                           bool use_weights,
                                           bool use_bias,
-                                          bool use_quantization,
-                                          bool use_output_calibration,
                                           uint32_t number_of_inputs_for_fused_prim) const {
     Arguments args;
 
@@ -131,14 +129,6 @@ Arguments common_kernel_base::GetArgsDesc(uint32_t num_of_input,
         args.push_back({ArgumentDescriptor::Types::BIAS, 0});
     }
 
-    if (use_quantization && use_weights) {
-        args.push_back({ArgumentDescriptor::Types::WEIGHTS_QUANTIZATION_FACTORS, 0});
-    }
-
-    if (use_output_calibration) {
-        args.push_back({ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 0});
-    }
-
     for (uint32_t i = 0; i < number_of_inputs_for_fused_prim; i++) {
         args.push_back({ArgumentDescriptor::Types::INPUT_OF_FUSED_PRIMITIVE, i});
     }
@@ -220,6 +210,6 @@ void common_kernel_base::FillCLKernelData(clKernelData& kernel,
     kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
     kernel.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode);
     kernel.arguments =
-        GetArgsDesc(number_of_inputs, weights, bias, false, false, number_of_inputs_for_fused_prims);
+        GetArgsDesc(number_of_inputs, weights, bias, number_of_inputs_for_fused_prims);
 }
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h
index 3bacb38eee5..3dc1c5ffd53 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h
@@ -48,8 +48,6 @@ protected:
     Arguments GetArgsDesc(uint32_t num_of_input,
                           bool use_weights,
                           bool use_bias,
-                          bool use_quantization = false,
-                          bool use_calibration = 0,
                           uint32_t number_of_inputs_for_fused_prim = 0) const;
     std::shared_ptr<KernelString> GetKernelString(const std::string& kernel_name,
                                                   const std::string& jit,
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
index 077aace30b6..a5449fc2157 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
@@ -557,8 +557,6 @@ JitConstants MakeActivationJitConstants(ActivationFunction activation_function,
     };
 
     std::string macro_def = name + (use_type_parameter ? "(jit_type, input, m, n)" : "(input, m, n)");
-    std::string macro_def_grad = name + (use_type_parameter ? "(jit_type, input_grad, input, m, n)"
-                                                            : "(input_grad, input, m, n)");
     std::string macro_def_params = use_type_parameter ? "(jit_type, input, params)" : "(input, params)";
 
     jitConstants.AddConstant(MakeJitConstant("ACTIVATION_PARAMS" + suffix, "NL_M" + suffix + ", NL_N" + suffix));
@@ -656,25 +654,6 @@ JitConstants MakeActivationJitConstants(ActivationFunction activation_function,
             jitConstants.AddConstant(MakeJitConstant(macro_def, "(pow(input," + m.str() + "))"));
             break;
         }
-        case ActivationFunction::RELU_GRAD:
-            jitConstants.AddConstant(MakeJitConstant(
-                macro_def_grad,
-                ("input_grad"_jit * ternary(input.gt(zero), one, zero)).str()));
-            macro_def_params = use_type_parameter ? "(jit_type, input_grad, input, params)" : "(input_grad, input, params)";
-            break;
-        case ActivationFunction::RELU_NEGATIVE_SLOPE_GRAD: {
-            const JitTerm slope = disable_type_conversion ? "m"_jit : to_type("m"_jit);
-            jitConstants.AddConstant(MakeJitConstant(
-                macro_def_grad,
-                ("input_grad"_jit * (ternary(input.gt(zero), one, zero) + (to_type(slope) * ternary(input.le(zero), one, zero))))
-                    .str()));
-            macro_def_params = use_type_parameter ? "(jit_type, input_grad, input, params)" : "(input_grad, input, params)";
-            break;
-        }
-        case ActivationFunction::NONE_GRAD:
-            jitConstants.AddConstant(MakeJitConstant(macro_def_grad, "input_grad"));
-            macro_def_params = use_type_parameter ? "(jit_type, input_grad, input, params)" : "(input_grad, input, params)";
-            break;
         case ActivationFunction::TAN:
             jitConstants.AddConstant(MakeJitConstant(macro_def, "(tan(input))"));
             break;
@@ -986,23 +965,14 @@ JitConstants MakeActivationJitConstants(std::vector