[IE CLDNN] FP16 support for some quantized kernels (#1855)

This commit is contained in:
Vladimir Paramuzov 2020-08-24 08:47:45 +03:00 committed by GitHub
parent d4567d5ab5
commit c8aa3a7f2b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 206 additions and 10 deletions

View File

@ -23,11 +23,16 @@ constexpr size_t sub_group_size = 16;
ParamsKey ConvolutionKernel_b_fs_yx_fsv4_int8::GetSupportedKey() const {
ParamsKey k;
k.EnableOutputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableTensorOffset();

View File

@ -73,6 +73,7 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputWeightsType(WeightsType::UINT8);

View File

@ -74,11 +74,15 @@ ParamsKey ConvolutionKernel_imad_b_fs_yx_fsv4_1x1::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputWeightsType(WeightsType::UINT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableDifferentTypes();

View File

@ -1,4 +1,4 @@
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -72,11 +72,15 @@ ParamsKey ConvolutionKernel_imad_b_fs_yx_fsv4_dw::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputWeightsType(WeightsType::UINT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableDifferentTypes();

View File

@ -34,6 +34,7 @@ ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetSupportedKey() co
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);

View File

@ -34,6 +34,7 @@ ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetSupportedKey() co
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);

View File

@ -25,10 +25,14 @@ ParamsKey ConvolutionKernel_mmad_b_fs_yx_fsv32::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);

View File

@ -26,10 +26,14 @@ ParamsKey ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableTensorOffset();

View File

@ -27,10 +27,14 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetSupportedKey() const
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);

View File

@ -25,10 +25,14 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::byxf_af32);

View File

@ -43,11 +43,14 @@ ParamsKey ConvolutionKernel_mmad_blocks::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();

View File

@ -31,10 +31,12 @@ ParamsKey DeconvolutionKernel_imad_along_f_tile_bfx::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputWeightsType(WeightsType::UINT8);

View File

@ -25,10 +25,12 @@ ParamsKey DeconvolutionKernel_imad_ref::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputWeightsType(WeightsType::UINT8);

View File

@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -29,6 +29,7 @@ ParamsKey FullyConnectedKernelIMAD::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);

View File

@ -25,11 +25,14 @@ ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableDifferentInputWeightsTypes();
k.EnableDifferentTypes();

View File

@ -23,10 +23,12 @@ ParamsKey GemmKernelMMADint8::GetSupportedKey() const {
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bfzyx);

View File

@ -23,10 +23,12 @@ ParamsKey GemmKernelMMADslmInt8::GetSupportedKey() const {
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bfzyx);

View File

@ -30,10 +30,12 @@ ParamsKey MVNKernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);

View File

@ -1,4 +1,4 @@
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -25,10 +25,12 @@ ParamsKey MVNKernelRef::GetSupportedKey() const {
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableAllInputLayout();
k.EnableAllOutputLayout();
k.EnableTensorOffset();

View File

@ -25,6 +25,7 @@ ParamsKey PoolingKernelGPU_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);

View File

@ -30,6 +30,7 @@ ParamsKey Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -26,10 +26,14 @@ ParamsKey ReduceKernelRef::GetSupportedKey() const {
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bfzyx);

View File

@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -92,7 +92,7 @@ KERNEL(fully_connected_gpu_IMAD)(
output[out_index] = res;
#else
output[out_index] = dequantized;
output[out_index] = TO_OUTPUT_TYPE(dequantized);
#endif
}

View File

@ -191,9 +191,11 @@ attach_eltwise_gpu::attach_eltwise_gpu() {
{ std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv32), eltwise_gpu::create },
//
{ std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), eltwise_gpu::create }});

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -97,9 +97,11 @@ attach_resample_gpu::attach_resample_gpu() {
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), resample_gpu::create}});
}

View File

@ -2353,6 +2353,21 @@ TEST_P(fc_int8_scale, basic) {
execute(p);
}
// Fusing test: int8 fully_connected -> scale forced to emit FP16
// (optional_data_type{f16}), then reordered back to FP32 for comparison.
// Exercises the new FP16 output support added to the quantized FC kernels.
TEST_P(fc_int8_scale, fp16_scale_out) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
// 1/count scaling keeps accumulated values small enough for fp16 range.
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
scale("scale", "fc_prim", "scale_data", optional_data_type{data_types::f16}),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
// NOTE(review): 1e-5 looks tight for an fp16 intermediate; presumably the
// scaled values are exactly representable in fp16 here — confirm if flaky.
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{ CASE_FC_U8S8_1, 2, 3 },
@ -2523,6 +2538,45 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_quantize_u8,
gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 },
}), );
// Fixture for gemm + fused per-channel scale tests (two gemm inputs).
class gemm_2in_scale : public GemmFusingTest {};
// Baseline: gemm (f32 out) with a fused scale, default (f32) scale output.
TEST_P(gemm_2in_scale, basic) {
auto p = GetParam();
create_topologies(input_layout("input0", get_input_layout(p, 0)),
input_layout("input1", get_input_layout(p, 1)),
// 1/count scaling normalizes the dot-product magnitude.
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
gemm("gemm_prim", { "input0", "input1" }, data_types::f32),
scale("scale", "gemm_prim", "scale_data"),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
tolerance = 1e-5f;
execute(p);
}
// Same topology as `basic`, but the fused scale is forced to produce FP16
// output (optional_data_type{f16}) before the final reorder to FP32 —
// covers the FP16 output path of the quantized gemm kernels.
TEST_P(gemm_2in_scale, fp16_scale_out) {
auto p = GetParam();
create_topologies(input_layout("input0", get_input_layout(p, 0)),
input_layout("input1", get_input_layout(p, 1)),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
gemm("gemm_prim", { "input0", "input1" }, data_types::f32),
scale("scale", "gemm_prim", "scale_data", optional_data_type{data_types::f16}),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
// NOTE(review): tolerance matches the f32 variant despite the fp16
// intermediate — verify this holds across the U8U8 cases below.
tolerance = 1e-5f;
execute(p);
}
// Runs both gemm_2in_scale tests over fp32, fp16 and three u8/u8 shapes.
// Trailing counts (3, 4) are the expected fused/unfused primitive counts.
INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_2in_scale,
::testing::ValuesIn(std::vector<gemm_test_params>{
gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_U8U8_1, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_U8U8_2, 3, 4 },
gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 },
}), );
class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {};
TEST_P(gemm_2in_act_scale_quantize_i8, basic) {
auto p = GetParam();
@ -3882,6 +3936,74 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_actv,
deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 3 },
}), );
// Fixture for deconvolution + fused per-channel scale tests.
class deconv_scale : public DeconvolutionFusingTest {};
// Baseline: deconv with a fused scale, default (f32) scale output.
TEST_P(deconv_scale, basic) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
// 1/count scaling normalizes the accumulated sum per output element.
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad),
scale("scale", "deconv", "scale_data"),
reorder("out", "scale", p.default_format, data_types::f32)
);
tolerance = 1e-5f;
execute(p);
}
// Same topology as `basic`, but the fused scale is forced to FP16 output
// (optional_data_type{f16}) — covers the FP16 output path added to the
// quantized (imad) deconvolution kernels.
TEST_P(deconv_scale, fp16_scale_out) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad),
scale("scale", "deconv", "scale_data", optional_data_type{data_types::f16}),
reorder("out", "scale", p.default_format, data_types::f32)
);
// NOTE(review): same 1e-5 tolerance as the f32 variant despite the fp16
// intermediate — confirm this is not flaky on larger 3D cases.
tolerance = 1e-5f;
execute(p);
}
// Runs both deconv_scale tests over all quantized 2D and 3D deconv cases
// (u8/s8 and s8/s8 input/weight combinations). Trailing counts (2, 3) are
// the expected fused/unfused primitive counts.
INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale,
::testing::ValuesIn(std::vector<deconv_test_params>{
deconv_test_params{ CASE_DECONV_U8S8_1, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_2, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_4, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_5, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_6, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_7, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_8, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_1, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_2, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_4, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_5, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_6, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_7, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_8, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 3 },
deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 3 },
deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 3 },
}), );
class deconv_actv_eltw_actv : public DeconvolutionFusingTest {};
TEST_P(deconv_actv_eltw_actv, basic) {
auto p = GetParam();
@ -4440,6 +4562,19 @@ TEST_P(pooling_f32_scale, basic) {
execute(p);
}
// Fusing test: 3x3 pooling followed by a scale forced to FP16 output
// (optional_data_type{f16}), reordered back to FP32 for comparison —
// covers the FP16 output path added to the quantized pooling kernels.
TEST_P(pooling_f32_scale, fp16_scale_out) {
auto p = GetParam();
create_topologies(
input_layout("input", get_input_layout(p)),
// Scale by 1/9 (the 3x3 window size) so the scaled pooling result
// stays in a range exactly comparable against the reference.
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{1, 1, 3, 3}.count())),
pooling("pooling", "input", p.pool_mode, tensor{1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}),
scale("scale", "pooling", "scale_data", optional_data_type{data_types::f16}),
reorder("output_reorder", "scale", format::bfyx, data_types::f32));
// NOTE(review): 1e-5 tolerance with an fp16 intermediate — confirm the
// 1/9-scaled values round-trip through fp16 without exceeding this bound.
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_f32_scale,
::testing::ValuesIn(std::vector<pooling_test_params>{