diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp index fc3e33267e3..dca5348789a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp @@ -139,6 +139,10 @@ bool ConvolutionKernel_fs_byx_fsv32::Validate(const Params& p, const optional_pa if (cp.output.Feature().pad.before % fsv != 0) return false; + // Input feature padding must be multiple of fsv to keep block alignment + if (cp.inputs[0].Feature().pad.before % fsv != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp index 67c0b3e8124..5533baa796d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp @@ -142,6 +142,10 @@ bool ConvolutionKernel_fs_byx_fsv32_1x1::Validate(const Params& p, const optiona if (cp.output.Feature().pad.before % fsv != 0) return false; + // Input feature padding must be multiple of fsv to keep block alignment + if (cp.inputs[0].Feature().pad.before % fsv != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp index dc3f4147249..cbb39991735 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp @@ -146,6 +146,10 @@ bool ConvolutionKernel_fs_byx_fsv32_depthwise::Validate(const Params& p, const o if (cp.output.Feature().pad.before % fsv != 0) return false; + // Input feature padding must be multiple of fsv to keep block alignment + if (cp.inputs[0].Feature().pad.before % fsv != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp index b963162bc18..25ccfe1c681 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp @@ -68,6 +68,9 @@ bool PoolingKerneGPU_fs_b_yx_fsv32::Validate(const Params& p, const optional_par if (pp.output.Feature().pad.before % 32 != 0) return false; + if (pp.inputs[0].Feature().pad.before % 32 != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl index 076464062ad..a3afa685304 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -103,7 +104,7 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)( uint input_offset = oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X; input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING; input_offset += INPUT0_PAD_BEFORE_FEATURE_NUM * INPUT0_FEATURE_PITCH; - input_offset += b * INPUT0_BATCH_PITCH; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * INPUT0_BATCH_PITCH; uint weight_offset = 0; weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV; @@ -243,12 +244,19 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); + uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (pad_before_fs + fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -309,5 +317,6 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)( #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING #undef INPUT_BLOCK_WIDTH_EL_CNT diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl index bed75181362..a6c7bbfa4b7 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,9 +21,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -74,10 +76,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( out[out_i] = UNIT_VAL_ZERO; } + // Calculate offset to first input data element + const uint in_pitch_x = FSV; + const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING; + const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING; + const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING; + uint input_offset = 0; - input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV; - input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV; - input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x; + input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b; + input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs; uint weight_offset = 0; weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV; @@ -108,7 +117,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ==================================================================== // Move temporary input offset to next row - tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV; + tmp_input_offset += DILATION_SIZE_Y * in_pitch_y; uint tmp_weight_offset = weight_offset; @@ -146,7 +155,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( weight_offset += FILTER_SIZE_X * FSV; } // Move input offset to next input feature slice - input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += in_pitch_fs; // Move weight offset to next input feature slice (FSV input features) // minus offset added by moving FILTER_SIZE_Y times to new row weight_offset += FSV * FILTER_SIZE_Y * FILTER_SIZE_X * FSV // FSV * input filter feature pitch @@ -190,13 +199,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * 
OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (fs + pad_before_fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -243,6 +258,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( #undef INPUT0_SIZE_X_WITH_PADDING #undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl index a82af383d37..7e9f8e7ccbb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,9 +21,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. 
#define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -73,10 +75,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( out[out_i] = UNIT_VAL_ZERO; } + // Calculate offset to first input data element + const uint in_pitch_x = FSV; + const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING; + const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING; + const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING; + uint input_offset = 0; - input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV; - input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV; - input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x; + input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b; + input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs; uint weight_offset = 0; weight_offset += fs * ALIGNED_IFM_NUM * FSV; @@ -119,11 +128,11 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( } } // Move temporary input offset to next strided row - tmp_input_offset += INPUT0_SIZE_X_WITH_PADDING * FSV * STRIDE_SIZE_Y; + tmp_input_offset += in_pitch_y * STRIDE_SIZE_Y; } // ======================================================================== // Move input offset to next input feature slice - input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += in_pitch_fs; } // ======================================================================== @@ -170,12 +179,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); + uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (pad_before_fs + fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -201,7 +217,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( UNIT_BLOCK_WRITE2(output, output_offset + out_x * FSV, tmp_write); } // Move output offset to next row - output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING; + output_offset += out_pitch_y; } } else @@ -225,7 +241,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( } } // Move output offset to next row - output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING; + output_offset += out_pitch_y; } } // 
======================================================================== @@ -235,6 +251,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( #undef INPUT0_SIZE_X_WITH_PADDING #undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl index 7131d3535e9..45c48973cc6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl @@ -21,9 +21,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -72,11 +74,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( out[out_i] = UNIT_VAL_ZERO; } + // Calculate offset to first input data element + const uint in_pitch_x = FSV; + const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING; + const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING; + const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING; + uint input_offset = 0; - input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV; - input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV; - input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; - input_offset += fs * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV * INPUT0_BATCH_NUM; + input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x; + input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b; + input_offset += (fs + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs; uint weight_offset = 0; @@ -105,7 +113,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ==================================================================== // Move temporary input offset to next row - tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV; + tmp_input_offset += DILATION_SIZE_Y * in_pitch_y; uint tmp_weight_offset = weight_offset; @@ -174,13 +182,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * 
OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (pad_before_fs + fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -227,6 +241,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( #undef INPUT0_SIZE_X_WITH_PADDING #undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl index 7c98ee79530..bec60b14b9e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl @@ -37,8 +37,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) + #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // Kernel works only for sub_group size of 16 with 32 features slice size and process 2 features per WI #define REQD_SUB_GROUP_SIZE 16 @@ -79,14 +82,18 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( const uint x_pitch = REQD_FEATURE_SLICE_SIZE; // difference in location between (x+1) and (x) const uint y_pitch = x_pitch * INPUT0_SIZE_X_WITH_PADDING; // difference in location between (y+1) and (y) const uint b_pitch = y_pitch * INPUT0_SIZE_Y_WITH_PADDING; // difference in location between (b+1) and (b) - const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM; // difference in location between (fs+1) and (fs) + const uint fs_pitch = b_pitch * INPUT0_SIZE_B_WITH_PADDING; // difference in location between (fs+1) and (fs) const int offset_x = (int)out_x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)out_y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch + INPUT0_PAD_BEFORE_SIZE_Y * y_pitch; + const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch + + INPUT0_PAD_BEFORE_SIZE_Y * y_pitch + + INPUT0_PAD_BEFORE_BATCH_NUM * b_pitch + + INPUT0_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE * fs_pitch; const size_t fs_offset = fs * fs_pitch; // locate beginning of feature tile const size_t 
b_offset = b * b_pitch; // locate beginning of batch + #ifdef CHECK_BOUNDRY if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y) @@ -152,15 +159,14 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( const size_t out_x_pitch = REQD_FEATURE_SLICE_SIZE; const size_t out_y_pitch = out_x_pitch * OUTPUT_SIZE_X_WITH_PADDING; const size_t out_b_pitch = out_y_pitch * OUTPUT_SIZE_Y_WITH_PADDING; - const size_t out_fs_pitch = out_b_pitch * OUTPUT_BATCH_NUM; + const size_t out_fs_pitch = out_b_pitch * OUTPUT_SIZE_B_WITH_PADDING; const size_t out_pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE); const size_t out_x_offset = (out_x + OUTPUT_PAD_BEFORE_SIZE_X) * out_x_pitch; const size_t out_y_offset = (out_y + OUTPUT_PAD_BEFORE_SIZE_Y) * out_y_pitch; - const size_t out_b_offset = b * out_b_pitch; + const size_t out_b_offset = (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_b_pitch; const size_t out_fs_offset = (fs + out_pad_before_fs) * out_fs_pitch; - const size_t output_offset = out_fs_offset + out_b_offset + out_y_offset + out_x_offset; const bool full_f = OUTPUT_FEATURE_NUM % REQD_FEATURE_SLICE_SIZE == 0 || @@ -204,3 +210,15 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( #undef OUTPUT_VEC2 #undef TO_OUTPUT_VEC2 + +#undef INPUT0_SIZE_X_WITH_PADDING +#undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING + +#undef OUTPUT_SIZE_X_WITH_PADDING +#undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING + +#undef REQD_SUB_GROUP_SIZE +#undef REQD_FEATURE_SLICE_SIZE +#undef REQD_FEATURES_PER_WORK_ITEM diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp index 362e9a76579..bda68337a64 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp @@ -448,8 +448,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const { definitions.push_back({ safe_index_func_name, safe_index_func_val }); definitions.push_back({ index_func_name, index_func_val }); } else { - definitions.push_back({ safe_index_func_name, "(f)" }); - definitions.push_back({ index_func_name, "(f)" }); + definitions.push_back({ safe_index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" }); + definitions.push_back({ index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" }); } } else { definitions.push_back({ safe_index_func_name, safe_index_func_val }); diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp index d2ce48e8c60..81db4d52529 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp @@ -38,6 +38,222 @@ using namespace cldnn; +namespace { + +struct concat_noop_optimization : pattern_match_optimization_typed { + // Removes concatenation nodes with single input. + using base = pattern_match_optimization_typed; + using base::base; + + bool match(concatenation_node& node); + bool optimize(concatenation_node& node); +}; + +struct concat_in_place_optimization : pattern_match_optimization_typed { + // Performs in-place concat optimization. + // Padding of predecessors is updated to use single buffer by all, which is output from concatenation. 
+    // Then concatenation can be optimized out, as memory will be correctly filled by previous nodes.
+    // If one of the dependencies is also an optimized-out concatenation, then cascade adjustment is performed to update it.
+    // This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
+    using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
+    using base::base;
+
+    // Runs concat in-place optimization and adds already optimized concatenations that need re-optimization to `need_reoptimization`.
+    void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
+    bool match(concatenation_node& node);
+    bool optimize(concatenation_node& node) {
+        std::list<concatenation_node*> need_reopt;
+        optimize_cascade(node, need_reopt);
+        while (!need_reopt.empty()) {
+            auto& prop = *need_reopt.front();
+            need_reopt.pop_front();
+            if (match(prop))
+                optimize_cascade(prop, need_reopt);
+            else
+                // TODO: Revert extra padding when cascade adjustment failed.
+                prop.can_be_optimized(false);
+        }
+        return false;  // node not invalidated
+    }
+};
+
+bool concat_noop_optimization::match(concatenation_node& node) {
+    if (node.is_output() && !get_program().is_debug_build())
+        return false;
+    return node.get_dependencies().size() == 1 &&
+           !node.has_fused_primitives() &&
+           node.get_fused_activations_funcs().empty();
+}
+
+bool concat_noop_optimization::optimize(concatenation_node& node) {
+    auto& dep = node.get_dependency(0);
+    dep.merge_output_padding(node.get_output_layout().data_padding);
+    prog.extract_and_remove(node);
+    // Node has been removed, so no further optimizations.
+    return true;
+}
+
+bool concat_in_place_optimization::match(concatenation_node& node) {
+    if (node.is_output() && !get_program().is_debug_build())
+        return false;
+    if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
+        return false;
+
+    // For in-place concatenation input layouts and data types must match.
+    auto output_format = node.get_output_layout().format;
+    auto output_datatype = node.get_output_layout().data_type;
+    auto concat_axis = node.get_primitive()->axis;
+
+    for (auto& input : node.get_dependencies()) {
+        if (input->is_type<reshape>())
+            // reshapes should be optimized out.
+            return false;
+
+        layout l = input->get_output_layout();
+
+        if (output_format != l.format || output_datatype != l.data_type)
+            return false;
+
+        // TODO: Below condition should be moved to program_node::supports_padding.
+        // This however will require updating the algorithm, as it may make cascade adjustment impossible in some cases.
+        // It would, however, make normal optimizations possible in other cases, so this is a trade-off to be investigated.
+ if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) && + (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required + if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + if (l.format == format::bs_fs_yx_bsv16_fsv16) + return false; + + if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f)) + return false; + } + + auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().raw[concat_axis]; + lower_padd_in_axis = std::max(lower_padd_in_axis, + node.get_dependency(0).get_output_layout().data_padding.lower_size().raw[concat_axis]); + + // check if concatenation in place can be applied for inputs set + size_t idx = 0; + for (auto input : node.get_dependencies()) { + // reverted condition - if any of this node's inputs is used by more than one primitive + // and is not optimized concatenation then do not fuse buffers + // todo: we need add padding support for all optimized kernels to remove this condition + if (!input->is_type() && !input->is_type() && + !input->is_type() && !input->is_type() && + !input->is_type() && !input->is_type() && !input->is_type() && + !input->is_type()) + return false; + + // if an input is marked as network output, prevent optimizations + // which would affect a form of its output (unless debug flag is set), + // we also need to restrict input types to those which support padding on all axis + if ((input->is_output() && !get_program().is_debug_build()) || + !input->is_padding_supported(concat_axis, lower_padd_in_axis)) + return false; + + // TODO: Investigate if this condition is needed + if (input->get_users().size() > 2) + return false; + + // Check that input isn't optimized out concatenation along different axis. + if (input->is_type() && input->can_be_optimized() && + input->as().get_primitive()->axis != concat_axis) + return false; + + // Check that input isn't optimized out non-concatenation. + if (!input->is_type() && input->can_be_optimized()) + return false; + + size_t concat_users = 0; + for (auto& user : input->get_users()) + if (user->is_type()) + concat_users += 1; + + // If input is used by more than one concatenation then they may require different paddings. + if (concat_users != 1) + return false; + + auto input_padd = input->get_output_layout().data_padding; + + // Check that there isn't already some padding between inputs in concat axis. + // If node has already been optimized we skip this check - this is just cascade adjustment. 
+ if (!node.can_be_optimized()) { + if (idx != node.get_dependencies().size() && input_padd.upper_size().raw[concat_axis] != 0) + return false; + if (idx != 0 && input_padd.lower_size().raw[concat_axis] != 0) + return false; + } + + lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis]; + idx += 1; + } + + return true; +} + +void concat_in_place_optimization::optimize_cascade(concatenation_node& node, std::list& need_reoptimization) { + auto concat_axis = node.get_primitive()->axis; + + // Select output padding by propagating all required input paddings. + auto padd = node.get_output_layout().data_padding; + for (auto input : node.get_dependencies()) { + padd = padding::max(padd, input->get_output_layout().data_padding); + } + + auto lower_padd = padd.lower_size(); + auto upper_padd = padd.upper_size(); + + // For cascade adjustment override padding in concat axis to output padding. + // In other case match(...) already checked that only first/last input have lower/upper padding. + if (node.can_be_optimized()) { + lower_padd.raw[concat_axis] = node.get_output_layout().data_padding.lower_size().raw[concat_axis]; + upper_padd.raw[concat_axis] = node.get_output_layout().data_padding.upper_size().raw[concat_axis]; + } + node.set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes())); + + upper_padd.raw[concat_axis] += node.get_output_layout().size.raw[concat_axis]; + + // apply concatenation in place optimization + for (auto input : node.get_dependencies()) { + auto input_lenght = input->get_output_layout().size.raw[concat_axis]; + + if (input->is_type() && input->can_be_optimized()) + need_reoptimization.push_back(&input->as()); + + // shrink upper pad so it points at the end of the input's buffer + // + // |--- lower padd ---| |---------- upper padd -----------| + // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| + upper_padd.raw[concat_axis] -= input_lenght; + + // set new padding for input + input->set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes())); + + // move lower padd further + // + // |-------------- lower padd -------------|---------- upper padd -----------| + // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| + lower_padd.raw[concat_axis] += input_lenght; + } + + node.can_be_optimized(true); + for (auto dep : node.get_users()) { + dep->can_share_buffer(false); + } +} + +} // namespace + // ToDo remove friendship relation from program_node void prepare_buffer_fusing::run(program_impl& p) { bool is_debug = p.get_options().get()->enabled(); @@ -57,198 +273,11 @@ void prepare_buffer_fusing::run(program_impl& p) { }; // [1] First try to optimize all concats - auto node_itr = p.get_processing_order().begin(); - while (node_itr != p.get_processing_order().end()) { - auto& node = (*node_itr++); - if (!can_optimize(node)) - continue; - program_helpers::do_for_types(*node, [&p, is_debug](concatenation_node& node) { - // For in place concatenation input layouts and data types must match - auto output_format = node.get_output_layout().format; - auto output_datatype = node.get_output_layout().data_type; - // we need to avoid mixing padded and unpadded buffer - bool all_dependencies_padded = true; - bool all_dependencies_unpadded = true; - for (auto& input : node.get_dependencies()) { - if (input->type() == reshape::type_id()) - // reshapes should be optimized out - return; - - layout l = input->get_output_layout(); - if (static_cast(l.data_padding)) - all_dependencies_unpadded = false; - else - 
all_dependencies_padded = false; - - if (output_format != l.format || output_datatype != l.data_type) - return; - - if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) && - (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required - if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - if (l.format == format::bs_fs_yx_bsv16_fsv16) - return; - - if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f)) - return; - } - - auto concat_axis = node.get_primitive()->axis; - auto padd = node.get_output_layout().data_padding; - - tensor lower_padd = padd.lower_size(); - tensor upper_padd = padd.upper_size(); - - auto upper_padd_val = - node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis]; - tensor lower_padd_offset = lower_padd; - - std::list, tensor>> stack = { - std::make_pair(node.get_dependencies(), tensor(0))}; - while (!stack.empty()) { - auto nodes_list = stack.front(); - stack.pop_front(); - - // if concatenation has only one input it does nothing, remove the node - if (node.get_dependencies().size() == 1) { - p.extract_and_remove(node); - return; - } - - auto cascade_adjustment = nodes_list.second; - upper_padd.raw[concat_axis] = upper_padd_val; - lower_padd = lower_padd_offset; - - auto lower_padd_in_axis = lower_padd.raw[concat_axis] + cascade_adjustment.raw[concat_axis]; - auto first_input_format = nodes_list.first[0]->get_output_layout().format; - - // check if concatenation in place can be applied for inputs set - for (auto input : nodes_list.first) { - // reverted condition - if any of this node's inputs is used by more than one primitive - // and is not optimized concatenation then do not fuse buffers - // todo: we need add padding support for all optimized kernels to remove this condition - if (!input->is_type() && !input->is_type() && - !input->is_type() && !input->is_type() && - !input->is_type() && !input->is_type() && !input->is_type() && - !input->is_type()) - return; - - // if an input is marked as network output, prevent optimizations - // which would affect a form of its output (unless debug flag is set), - // we also need to restrict input types to those which support padding on all axis - if ((input->is_output() && !is_debug) || input->get_users().size() > 2 || - !input->is_padding_supported(concat_axis, lower_padd_in_axis)) - return; - - if (input->get_users().size() > 1) { - auto user_count = input->get_users().size(); - for (auto& user : input->get_users()) - if (user->is_type()) - user_count--; - if (user_count != 1) // user_cout == 0 means that input will be used only by concatenations, so - // we cannot apply concat in place for it - return; - } - - // check if all inputs have the same format - if (input->get_output_layout().format != first_input_format) - return; - - lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis]; - } - - // check if it is worth doing concat in place, in case the following primitive 
is convolution - // with different input padding than concatenation's input users' convolutions, - // it is likely that convolution's implementation will be a reference one, due to mismatched padding - // and performance gain by doing in place concat is nullified by slower convolution implementation - // this should be handled by more advanced tuning mechanism on the topology level - auto& users = node.get_users(); - if (users.size() == 1) { - auto& user = users.front(); - if (node.get_output_layout().format == format::bfyx && user->type() == convolution::type_id()) { - auto out_input_offsets = user->as().get_primitive()->input_offset; - - std::vector in_input_offsets; - for (auto& in_user : nodes_list.first) { - if (in_user->type() == convolution::type_id()) - in_input_offsets.push_back(in_user->as().get_primitive()->input_offset); - } - - for (auto& in_input_offset : in_input_offsets) { - if (in_input_offset.spatial[0] != out_input_offsets.spatial[0] && - in_input_offset.spatial[1] != out_input_offsets.spatial[1]) - return; - } - } else if (user->type() == fused_conv_eltwise::type_id()) { - if (!user->as().get_fused_primitives().empty() && - user->as().get_fused_primitives().begin()->node->is_type()) - return; - } - } - - // apply concatenation in place optimization - for (auto input : nodes_list.first) { - auto input_lenght = input->get_output_layout().size.raw[concat_axis]; - - bool optimized_concat_input = false; - if (input->type() == concatenation::type_id() && input->can_be_optimized()) { - if (input->as().get_primitive()->axis != node.get_primitive()->axis) - return; - optimized_concat_input = true; - } else if (input->can_be_optimized()) { - return; - } - - // shrink upper pad so it points at the end of the input's buffer - // - // |--- lower padd ---| |---------- upper padd -----------| - // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| - upper_padd.raw[concat_axis] -= input_lenght; - - // adjust padding sizes for cascade concatenations - auto lower_padd_tmp = lower_padd; - lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis]; - auto upper_padd_tmp = upper_padd; - upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis]; - - // set new padding for input - input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes())); - - // move lower padd further - // - // |-------------- lower padd -------------|---------- upper padd -----------| - // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| - - lower_padd.raw[concat_axis] += input_lenght; - - if (optimized_concat_input && !input->get_dependencies().empty()) - stack.push_back(std::make_pair(input->get_dependencies(), - input->get_output_layout().data_padding.lower_size())); - } - } - - node.can_be_optimized(true); - for (auto dep : node.get_users()) { - dep->can_share_buffer(false); - } - if (!all_dependencies_padded && !all_dependencies_unpadded) - node.can_share_buffer(false); - }); - } + run_node_optimizations(p); // [2] Then try to optimize all crops - node_itr = p.get_processing_order().begin(); + auto node_itr = p.get_processing_order().begin(); while (node_itr != p.get_processing_order().end()) { auto& node = (*node_itr++); if (!can_optimize(node)) diff --git a/inference-engine/thirdparty/clDNN/src/include/program_helpers.h b/inference-engine/thirdparty/clDNN/src/include/program_helpers.h index 7ec262217e1..57e56f7587b 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_helpers.h +++ 
b/inference-engine/thirdparty/clDNN/src/include/program_helpers.h
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -135,4 +135,105 @@ struct program_helpers {
     }
     static layout get_weights_layout(typed_program_node<data>& data_node, int32_t split);
 };
+
+// Base class for performing pattern-match style optimizations.
+// Uses the CRTP idiom; the implementing class should be passed as template parameter `Impl`
+// and overload the match and optimize methods.
+template <typename Impl>
+struct pattern_match_optimization {
+    pattern_match_optimization(program_impl& prog)
+        : prog(prog)
+    {}
+
+    // Returns whether optimization can be performed for specified node.
+    bool match(program_node& node) {
+        return static_cast<Impl*>(this)->match(node);
+    }
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool optimize(program_node& node) {
+        // TODO: Add program optimizer class that would take responsibility of modifying program.
+        // Then use it to provide more complex control over pattern-matches, i.e.:
+        // new node added - run applicable optimizations on it as well;
+        // node deleted - don't do more optimizations;
+        return static_cast<Impl*>(this)->optimize(node);
+    }
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool match_and_optimize(program_node& node) {
+        if (!match(node))
+            return false;
+        return optimize(node);
+    }
+
+    program_impl& get_program() { return prog; }
+
+    program_impl& prog;
+};
+
+// Class for pattern-match optimizations that provides support for matching
+// a single primitive type `Prim`.
+// Implementing class `Impl` is expected to overload:
+//   bool match(typed_program_node<Prim>&)
+//   bool optimize(typed_program_node<Prim>&)
+// Uses the CRTP idiom; the implementing class should be passed as template parameter `Impl`.
+template <typename Impl, typename Prim>
+struct pattern_match_optimization_typed : pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>> {
+    using base = pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>>;
+
+    using base::base;
+
+    // Returns whether optimization can be performed for specified node.
+    bool match(program_node& node) {
+        if (!node.is_type<Prim>())
+            return false;
+        return static_cast<Impl*>(this)->match(node.as<Prim>());
+    }
+    // Should be overloaded by implementation class to match specified primitive.
+    bool match(typed_program_node<Prim>& node) {
+        return false;
+    }
+
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool optimize(program_node& node) {
+        return static_cast<Impl*>(this)->optimize(node.as<Prim>());
+    }
+    // Should be overloaded by implementation class to optimize specified primitive.
+    bool optimize(typed_program_node<Prim>& node) {
+        return false;
+    }
+};
+
+// Runs pattern-match optimizations passed as arguments on `node`.
+inline bool run_node_optimizations(program_node& /*node*/) {
+    return false;
+}
+
+template <typename Opt, typename... Rest>
+bool run_node_optimizations(program_node& node, Opt&& opt, Rest&&... rest) {
+    if (opt.match_and_optimize(node))
+        return true;
+    return run_node_optimizations(node, std::forward<Rest>(rest)...);
+}
+
+// Runs pattern-match optimizations `Opts` on `node`.
+// Optimizations should have a constructor with single argument `program_impl&`.
+template <typename... Opts>
+bool run_node_optimizations(program_impl& p, program_node& node) {
+    return run_node_optimizations(node, Opts(p)...);
+}
+
+// Runs specified pattern-match optimizations on whole program, in processing order.
+template +void run_node_optimizations(program_impl& p, Opts&&... opts) { + auto it = p.get_processing_order().begin(); + while (it != p.get_processing_order().end()) { + auto node = *it++; + run_node_optimizations(*node, std::forward(opts)...); + } +} + +template +void run_node_optimizations(program_impl& p) { + run_node_optimizations(p, Opts(p)...); +} + } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp index 0d8bc880002..05e32e547d3 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -431,6 +431,211 @@ TEST(depth_concatenate_f32_gpu, test05_different_formats) { } } +TEST(depth_concatenate_f32_gpu, test06_padded_input) { + // input1 - activation - concatenation - concatenation - reorder + // / / + // input2 - activation - convolution* / + // + // *Convolution has input offset so it should be propagated, both back to reorders and to second concatenation. + // As a result both concatenations should be optimized out and convolution should use optimized implementation. + const int32_t input_f = 32; + const int32_t output_f = 3 * input_f; + + const auto& engine = get_test_engine(); + auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} }); + auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} }); + + auto input1_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + auto input2_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + set_values(input1, flatten_4d(format::bfyx, input1_data)); + set_values(input2, flatten_4d(format::bfyx, input2_data)); + + auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {input_f, input_f, 3, 3} }); + // Construct weights for convolution that just double input values. 
+    VVVVF<FLOAT16> weights_data;
+    weights_data.resize(input_f);
+    for (size_t oi = 0; oi < input_f; ++oi) {
+        weights_data[oi].resize(input_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+        weights_data[oi][oi][1][1] = 2.f;
+    }
+    set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+    topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+    topology.add(data("weights", weights));
+    topology.add(convolution("conv", "actv2", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+    topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+    topology.add(concatenation("depth2", { "depth1", "conv" }, concatenation::along_f));
+    topology.add(reorder("output", "depth2", format::bfyx, data_types::f32));
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+    network network(engine, topology, options);
+
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+
+    auto outputs = network.execute({});
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "output");
+    // Check that all concatenations have been optimized out.
+    auto executed_primitives = network.get_executed_primitives();
+    EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+    EXPECT_TRUE(executed_primitives.count("depth2") == 0);
+    // Check that convolution was able to use an optimized kernel.
+    for (auto& info : network.get_primitives_info()) {
+        if (info.original_id == "conv") {
+            EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+        }
+    }
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output.count(), output_f);
+    for (size_t i = 0; i < output_f; ++i) {
+        auto& val = output_ptr[i];
+        float ref;
+        if (i < input_f)
+            ref = 0.75f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+        else if (i < 2 * input_f)
+            ref = 0.5f * static_cast<float>(input2_data[0][i % input_f][0][0]);
+        else
+            ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+        EXPECT_EQ(val, ref) << " at i=" << i;
+    }
+}
+
+TEST(depth_concatenate_f32_gpu, test07_padded_output) {
+    // input1 - activation - concatenation - convolution - reorder
+    // input2 - activation /
+    //
+    // *Convolution has input offset so it should be propagated back to activations.
+    const int32_t input_f = 32;
+    const int32_t output_f = 2 * input_f;
+
+    const auto& engine = get_test_engine();
+    auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+    auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+
+    auto input1_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    auto input2_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    set_values(input1, flatten_4d(format::bfyx, input1_data));
+    set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+    auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {output_f, output_f, 3, 3} });
+    // Construct weights for convolution that just double input values.
+    VVVVF<FLOAT16> weights_data;
+    weights_data.resize(output_f);
+    for (size_t oi = 0; oi < output_f; ++oi) {
+        weights_data[oi].resize(output_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+        weights_data[oi][oi][1][1] = 2.f;
+    }
+    set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+    topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+    topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+    topology.add(data("weights", weights));
+    topology.add(convolution("conv", "depth1", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+    topology.add(reorder("output", "conv", format::bfyx, data_types::f32));
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+    network network(engine, topology, options);
+
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+
+    auto outputs = network.execute({});
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "output");
+    // Check that all concatenations have been optimized out.
+    auto executed_primitives = network.get_executed_primitives();
+    EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+    // Check that convolution was able to use an optimized kernel.
+    for (auto& info : network.get_primitives_info()) {
+        if (info.original_id == "conv") {
+            EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+        }
+    }
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output.count(), output_f);
+    for (size_t i = 0; i < output_f; ++i) {
+        auto& val = output_ptr[i];
+        float ref;
+        if (i < input_f)
+            ref = 1.5f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+        else
+            ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+        EXPECT_EQ(val, ref) << " at i=" << i;
+    }
+}
+
+TEST(depth_concatenate_f32_gpu, test07_concat_is_output) {
+    // input1 - activation - concatenation
+    // input2 - activation /
+    //
+    // As the concatenation is an output it should not be optimized out.
+ const int32_t input_f = 16; + const int32_t output_f = 2 * input_f; + + const auto& engine = get_test_engine(); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} }); + + auto input1_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + auto input2_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + set_values(input1, flatten_4d(format::bfyx, input1_data)); + set_values(input2, flatten_4d(format::bfyx, input2_data)); + + topology topology; + topology.add(input_layout("input1", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f })); + topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f })); + topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f)); + + cldnn::build_options options; + options.set_option(cldnn::build_option::optimize_data(true)); + network network(engine, topology, options); + + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + + auto outputs = network.execute({}); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "depth1"); + // Check that concatenation haven't been optimized out. + auto executed_primitives = network.get_executed_primitives(); + EXPECT_TRUE(executed_primitives.count("depth1") == 1); + + auto output = outputs.at("depth1").get_memory(); + auto output_ptr = output.pointer(); + ASSERT_EQ(output.count(), output_f); + for (size_t i = 0; i < output_f; ++i) { + auto& val = output_ptr[i]; + float ref; + if (i < input_f) + ref = 0.75f * input1_data[0][i % input_f][0][0]; + else + ref = 0.5f * input2_data[0][i % input_f][0][0]; + + EXPECT_EQ(val, ref) << " at i=" << i; + } +} + TEST(depth_concatenate_f32_gpu, concat_with_different_format_inputs) { const auto& engine = get_test_engine(); build_options build_opt;
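To tie the two halves of this patch together, below is a small self-contained C++ sketch. It is not taken from the patch; names such as fs_b_yx_fsv32_offset and assign_in_place_padding are invented for illustration only. The first function mirrors the addressing scheme the reworked kernels use, where each pitch is built from the padded extent of the previous dimension and the batch/feature "before" paddings are added to the indices (previously the fs pitch used the unpadded batch count and the batch padding was ignored). The second reproduces the lower/upper padding walk from concat_in_place_optimization::optimize_cascade, which is what lets every concatenation input write straight into the shared output buffer.

#include <cassert>
#include <cstdio>
#include <vector>

// Feature slice size of the fs_b_yx_fsv32 layout.
constexpr int FSV = 32;

// Offset of logical element (b, f, y, x) in an fs_b_yx_fsv32 buffer whose padded
// extents are size_*_with_padding and whose "before" paddings are pad_before_*.
// pad_before_f is assumed to be a multiple of FSV, which is exactly what the new
// Validate() checks in the kernel selectors enforce.
int fs_b_yx_fsv32_offset(int b, int f, int y, int x,
                         int size_x_with_padding, int size_y_with_padding, int size_b_with_padding,
                         int pad_before_x, int pad_before_y, int pad_before_b, int pad_before_f) {
    const int pitch_x  = FSV;
    const int pitch_y  = pitch_x * size_x_with_padding;
    const int pitch_b  = pitch_y * size_y_with_padding;
    // The patch changes the fs pitch to use the padded batch extent instead of the
    // raw batch count, and adds pad_before_b to the batch index below.
    const int pitch_fs = pitch_b * size_b_with_padding;

    const int fs = (f + pad_before_f) / FSV;  // feature slice index
    const int fi = (f + pad_before_f) % FSV;  // index inside the 32-wide slice
    return fs * pitch_fs +
           (b + pad_before_b) * pitch_b +
           (y + pad_before_y) * pitch_y +
           (x + pad_before_x) * pitch_x +
           fi;
}

// Plain aggregate standing in for cldnn::padding along a single axis.
struct axis_padding {
    int lower;  // elements before this input's data in the shared buffer
    int upper;  // elements after this input's data in the shared buffer
};

// Padding each input needs along the concat axis so that all of them write directly
// into the concatenation's buffer; mirrors the lower/upper walk in optimize_cascade.
std::vector<axis_padding> assign_in_place_padding(int out_lower, int out_upper,
                                                  const std::vector<int>& input_sizes) {
    int total = 0;
    for (int s : input_sizes)
        total += s;

    int lower = out_lower;          // grows as we walk over the inputs
    int upper = out_upper + total;  // shrinks as we walk over the inputs

    std::vector<axis_padding> result;
    for (int len : input_sizes) {
        upper -= len;                     // shrink upper pad to the end of this input
        result.push_back({lower, upper});
        lower += len;                     // move lower pad past this input
    }
    return result;
}

int main() {
    // Two 32-feature inputs concatenated along f with no extra output padding:
    // input1 occupies features [0, 32) and needs 32 features of upper padding,
    // input2 occupies features [32, 64) and needs 32 features of lower padding.
    auto padds = assign_in_place_padding(0, 0, {32, 32});
    assert(padds[0].lower == 0 && padds[0].upper == 32);
    assert(padds[1].lower == 32 && padds[1].upper == 0);

    // With that padding, element (b=0, f=0, y=0, x=0) of input2 lands one full
    // feature slice into the shared 1x64x1x1 fs_b_yx_fsv32 buffer (offset 32).
    int off = fs_b_yx_fsv32_offset(0, 0, 0, 0,
                                   /*size_x*/ 1, /*size_y*/ 1, /*size_b*/ 1,
                                   /*pad_x*/ 0, /*pad_y*/ 0, /*pad_b*/ 0,
                                   /*pad_f*/ padds[1].lower);
    assert(off == 32);
    std::printf("input2 element (0,0,0,0) -> offset %d\n", off);
    return 0;
}

The multiple-of-FSV requirement in the sketch is the reason the Validate() additions at the top of this patch reject input feature padding that is not a multiple of 32: only then does the lower feature padding translate into a whole number of feature slices.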