[VPU][OpenCL] Update custom kernels (#2131)

* [Custom CL] Updated OpenCL kernels and tests * [Custom CL] Update OpenCL compiler * Update firmware to 1365 * Disable ExpGenerateProposals tests * VPU: new firmware no. 1370 * Myriad: re-enable ExpGenerateProposals tests Co-authored-by: Maxim Kurin <maxim.kurin@intel.com>
2020-09-09 03:50:40 +03:00 · 2020-09-09 03:50:40 +03:00 · 5ad4811793
commit 5ad4811793
parent 867340e8f1
49 changed files with 2947 additions and 4072 deletions
--- a/inference-engine/cmake/vpu_dependencies.cmake
+++ b/inference-engine/cmake/vpu_dependencies.cmake
@ -19,8 +19,8 @@ set(VPU_SUPPORTED_FIRMWARES usb-ma2450 usb-ma2x8x pcie-ma248x)
 # Default packages
 #

-set(FIRMWARE_PACKAGE_VERSION 1360)
-set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.02.0")
+set(FIRMWARE_PACKAGE_VERSION 1370)
+set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.09.0")

 #
 # CMake variables to override default firmware files
--- a/inference-engine/src/vpu/common/src/utils/simple_math.cpp
+++ b/inference-engine/src/vpu/common/src/utils/simple_math.cpp
@ -65,9 +65,14 @@ void MathExpression::parse(const std::string& expression) {
        // parse number
        if (std::isdigit(*it)) {
            size_t len = 0;
+            // parse number and use its length
            const auto value = std::stof(&*it, &len);
+            (void) value;
+            // copy sub string that represents a number
+            auto substring = std::string{it, it + len};

-            _parsedTokens.emplace_back(TokenType::Value, ValueType{value}, "");
+            auto token = Token{TokenType::Value, ValueType{substring}, ""};
+            _parsedTokens.push_back(std::move(token));

            std::advance(it, len - 1);
            continue;
@ -84,6 +89,7 @@ void MathExpression::parse(const std::string& expression) {
                tokenStack.push(token);
                continue;
            }
+
            if (_vars.find(token) != _vars.end()) {
                _parsedTokens.emplace_back(TokenType::Value, ValueType{_vars.at(token)}, "");
                continue;
--- a/inference-engine/src/vpu/custom_kernels/binarization.cl
+++ b/inference-engine/src/vpu/custom_kernels/binarization.cl
@ -0,0 +1,67 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+__kernel void binarization(
+    const __global half *__restrict src_data,
+    const __global half *__restrict input_low_high,
+    const __global half *__restrict dst_data,
+    int switch_out,
+    int input_low_high_size,
+    int W,
+    int H)
+{
+    __local half local_src[15 * 1024];
+    __local half local_dst[15 * 1024];
+
+    event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
+    wait_group_events(1, &e1);
+
+    int c = get_global_id(2);
+    int C = get_global_size(2);
+
+    half dst_low  = switch_out ? 1.h : -1.h;
+    half dst_high = switch_out ? -1.h : 1.h;
+
+    half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
+
+    for (int h = 0; h < H; h++) {
+
+        __local const half *__restrict addr_src = local_src + h * W;
+        __local half *__restrict addr_dst       = local_dst + h * W;
+
+#if 1
+        for (int w = 0; w < W / 8; w++) {
+
+            half8 h_src_val8 = (*((__local half8 *)addr_src + w));
+
+            short8 cond1;
+            cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
+            cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
+            cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
+            cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
+            cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
+            cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
+            cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
+            cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
+
+            cond1 = ~(cond1 - (short8)1);
+
+            short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);
+
+            *((__local half8 *)addr_dst + w) = as_half8(res);
+        }
+#endif
+        for (int w = W & (~0x7); w < W; w++) {
+            addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/binary_convolution.cl
+++ b/inference-engine/src/vpu/custom_kernels/binary_convolution.cl
@ -0,0 +1,95 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
+
+__kernel void binary_convolution(
+    const __global half *restrict src_data,
+    const __global uchar *restrict weights_data,
+    __global half *restrict dst_data,
+    float pad_value,
+
+    int IW,
+    int IH,
+    int IC,
+
+    int DW,
+    int DH,
+
+    int GC,
+
+    int KW,
+    int KH,
+
+    int PW,
+    int PH,
+
+    int SW,
+    int SH)
+{
+    int ipad_value = ((pad_value > 0.f) ? 1 : 0);
+    int c          = get_global_id(2);
+    int y          = get_global_id(1);
+    int x          = get_global_id(0);
+
+    int OC = get_global_size(2);
+    int OH = get_global_size(1);
+    int OW = get_global_size(0);
+
+    int KD = 1;
+    int SD = 0;
+    int DD = 0;
+    int PD = 0;
+    int ID = 1;
+    int OD = 1;
+
+    int nbits = 8;
+
+    int g  = c % GC;
+    int oc = c / GC;
+    int oh = y;
+    int ow = x;
+
+    for (int od = 0; od < OD; od++) {
+        int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow;
+
+        int res = 0;
+
+        for (int ic = 0; ic < IC / GC; ic++) {
+            for (int kd = 0; kd < KD; kd++) {
+                for (int kh = 0; kh < KH; kh++) {
+                    for (int kw = 0; kw < KW; kw++) {
+                        int widx = g * OC / GC * IC / GC * KD * KH * KW
+                                   + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW
+                                   + kh * KW + kw;
+
+                        int w = extract_weights(weights_data[widx / nbits], (widx % nbits));
+
+                        int s;
+
+                        int iw = ow * SW - PW + kw * DW;
+                        int ih = oh * SH - PH + kh * DH;
+                        int id = od * SD - PD + kd * DD;
+
+                        if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0
+                            || id >= (int)ID) {
+                            s = ipad_value;
+                        } else {
+                            int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW
+                                       + ih * IW + iw;
+
+                            s = ((src_data[iidx] > 0.f) ? 1 : 0);
+                        }
+
+                        res += s ^ w;
+                    }
+                }
+            }
+        }
+
+        dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res);
+    }
+}
--- a/inference-engine/src/vpu/custom_kernels/binary_convolution1x1.cl
+++ b/inference-engine/src/vpu/custom_kernels/binary_convolution1x1.cl
@ -3,186 +3,115 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-ushort extract_weights(uchar val, int bit)
-{
-    return ((val >> bit) & 1);
-}
+ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }

 __kernel void binary_convolution(
-                        const __global half*  restrict src_data,
-                        const __global uchar* restrict weights_data,
-                        const __global half*  restrict dst_data,
-                                 float pad_value,
+    const __global half *restrict src_data,
+    const __global uchar *restrict weights_data,
+    __global half *restrict dst_data,
+    float pad_value,

-                                       int    IW,
-                                       int    IH,
-                                       int    IC,
+    int IW,
+    int IH,
+    int IC,

-                                       int    DW,
-                                       int    DH,
+    int DW,
+    int DH,

-                                       int    GC,
+    int GC,

-                                       int    KW,
-                                       int    KH,
+    int KW,
+    int KH,

-                                       int    PW,
-                                       int    PH,
+    int PW,
+    int PH,

-                                       int    SW,
-                                       int    SH,
+    int SW,
+    int SH,

-                                       int    OW,
-                        const __local  half*  restrict src_local,
-                              __local  half*  restrict dst_local)
+    int OW)
 {
-    int oh = get_global_id(0);
-    int oc = get_global_id(1);
-    int OH = get_global_size(0);
-    int OC = get_global_size(1);
+    __local half src_local[32 * 1024];
+    __local half dst_local[2 * 1024];
+
+    const int oh = get_group_id(0);
+    const int oc = get_group_id(1);
+    const int OH = get_global_size(0);
+    const int OC = get_global_size(1);
+
+    const int gc = oc / (OC / GC);
+
+    if (oh * SH >= 0 && oh * SH <= IH - 1) {
+        const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;
+
+        event_t e1 = async_work_group_copy_2D2D(
+            src_local, // dst
+            src, // src
+            IW, // num_elements_per_line,
+            IC / GC, // num_lines,
+            IH * IW - IW, // src_line_stride,
+            0, // dst_line_stride,
+            0);
+        wait_group_events(1, &e1);
+    }

    half pad_value_half = convert_half(pad_value);

    //padding row
-    if (oh * SH > IH - 1)
-    {
-        __local half* dst = src_local;
-        for(int c = 0; c < IC/GC; c++)
-        {
+    if (oh * SH > IH - 1) {
+        __local half *dst = src_local;
+        for (int c = 0; c < IC / GC; c++) {
            #pragma unroll 8
-            for(int j = 0; j < IW; j++)
-            {
+            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += IW;
        }
-    } 
-  
+    }
+
    int OWS = SW * OW;
    ushort8 in;

-    for (int ows8 = 0; ows8 < (OWS+7)/8; ows8++)
-    {
+    for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
        ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
-        for (int ic = 0; ic < IC/GC; ++ic)
-        {
-            __local half* src = (__local half*)((__local half8*)(src_local + ic * IW) + ows8);
-            int weight_pos = oc * IC/GC + ic;
-            ushort w = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
+        for (int ic = 0; ic < IC / GC; ++ic) {
+            __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
+            int weight_pos    = oc * IC / GC + ic;
+            ushort w =
+                extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));

-            if ((ows8 * 8) <= IW - 1)
-            {
-                in = *((__local ushort8*)(src)); 
+            if ((ows8 * 8) <= IW - 1) {
+                in = *((__local ushort8 *)(src));
            }

            //padding column
-            if (ows8 * 8 + 7 > IW - 1)
-            {
+            if (ows8 * 8 + 7 > IW - 1) {
                int boundary = (IW - 1) - ows8 * 8 + 1;
-                boundary = boundary < 0 ? 0 : boundary;
-                for (int offset = boundary; offset < 8; offset++)
-                {
-                    *((half*)(&in) + offset) = pad_value_half;
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in) + offset) = pad_value_half;
                }
            }

            ushort8 w8 = (ushort8)(w);

-            ushort8 cond = (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            
+            ushort8 cond =
+                (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
+
            val += (cond ^ w8);
-        } 
-        
+        }
+
        ushort8 val_shift = val << 1;
-        int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
-        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++)
-        {
-            *(dst_local + ow) = (half)(IC/GC - *((ushort*)(&val_shift) + ow * SW - ows8 * 8));
+        int boundary      = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
+        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
+            *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
        }
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
+    wait_group_events(1, &e2);
 }
-
-__kernel void __dma_preload_binary_convolution(
-                        const __global half*  restrict src_data,
-                        const __global uchar* restrict weights_data,
-                        const __global half*  restrict dst_data,
-                                 float pad_value,
-
-                                       int    IW,
-                                       int    IH,
-                                       int    IC,
-
-                                       int    DW,
-                                       int    DH,
-
-                                       int    GC,
-
-                                       int    KW,
-                                       int    KH,
-
-                                       int    PW,
-                                       int    PH,
-
-                                       int    SW,
-                                       int    SH,
-
-                                       int    OW,
-                              __local  half*  restrict src_local,
-                        const __local  half*  restrict dst_local)
-{
-    const int oh = get_group_id(0);
-    const int oc = get_group_id(1);
-    const int OC = get_global_size(1);
-
-    const int gc = oc / (OC/GC);
-
-    if (oh * SH >= 0 && oh * SH <= IH - 1)
-    {
-        const __global half* src = src_data + (gc * IC/GC) * IW * IH + (SH * oh) * IW;
-        WorkGroupDmaCreateStrideTransaction(
-            src, // src
-            src_local, // dst
-            IW * sizeof(half), // src width
-            IW * sizeof(half), // dst width
-            IH * IW * sizeof(half), // src stride
-            IW * sizeof(half),  // dst stride
-            IW * IC/GC * sizeof(half), //total size
-            0
-            );  
-    }
-}
-__kernel void __dma_postwrite_binary_convolution(
-                        const __global half*  restrict src_data,
-                        const __global uchar* restrict weights_data,
-                              __global half*  restrict dst_data,
-                                 float pad_value,
-
-                                       int    IW,
-                                       int    IH,
-                                       int    IC,
-
-                                       int    DW,
-                                       int    DH,
-
-                                       int    GC,
-
-                                       int    KW,
-                                       int    KH,
-
-                                       int    PW,
-                                       int    PH,
-
-                                       int    SW,
-                                       int    SH,
-
-                                       int    OW,
-                        const __local  half*  restrict src_local,
-                        const __local  half*  restrict dst_local)
-{
-    const int oh = get_group_id(0);
-    const int oc = get_group_id(1);
-    const int OH = get_global_size(0);
-
-    async_work_group_copy(dst_data + oc*OW*OH + oh*OW, dst_local, OW, 0);
-}
--- a/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl
+++ b/inference-engine/src/vpu/custom_kernels/binary_convolution3x3.cl
@ -3,82 +3,131 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-ushort extract_weights(uchar val, int bit) 
-{
-    return ((val >> bit) & 1);
-}
+ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }

 __kernel void binary_convolution(
-                        const __global half*  restrict src_data,
-                        const __global uchar* restrict weights_data,
-                        const __global half*  restrict dst_data,
-                                 float pad_value,
+    const __global half *restrict src_data,
+    const __global uchar *restrict weights_data,
+    const __global half *restrict dst_data,
+    float pad_value,

-                                       int    IW,
-                                       int    IH,
-                                       int    IC,
+    int IW,
+    int IH,
+    int IC,

-                                       int    DW,
-                                       int    DH,
+    int DW,
+    int DH,

-                                       int    GC,
+    int GC,

-                                       int    KW,
-                                       int    KH,
+    int KW,
+    int KH,

-                                       int    PW,
-                                       int    PH,
+    int PW,
+    int PH,

-                                       int    SW,
-                                       int    SH,
+    int SW,
+    int SH,

-                                       int    OW,
-                        const __local  half*  restrict src_local,
-                              __local  half*  restrict dst_local)
+    int OW)
 {
-    int oh = get_global_id(0);
-    int oc = get_global_id(1);
-    int OH = get_global_size(0);
-    int OC = get_global_size(1);
+    __local half src_local[32 * 1024];
+    __local half dst_local[2 * 1024];
+
+    const int oh = get_group_id(0);
+    const int oc = get_group_id(1);
+    const int OH = get_global_size(0);
+    const int OC = get_global_size(1);
+
+    const int gc = oc / (OC / GC);
+
+    if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
+    {
+        event_t e = async_work_group_copy_3D3D(
+            src_local, // dst
+            src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
+            IW, // num_elements_per_line
+            3, // num_lines
+            DH * IW - IW, // src_line_stride
+            0, // dst_line_stride
+            IC / GC, // num planes
+            IH * IW - 3 * IW, // src plane stride
+            0, // dst plane stride
+            0);
+        wait_group_events(1, &e);
+    } else {
+        int ih = oh * SH - 1;
+        if (ih >= 0 && ih <= IH - 1) //dma for first row
+        {
+            event_t e = async_work_group_copy_2D2D(
+                src_local, // dst
+                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
+                IW, // num_elements_per_line,
+                IC / GC, // num_lines,
+                IH * IW - IW, // src_line_stride,
+                2 * IW, // dst_line_stride,
+                0);
+
+            wait_group_events(1, &e);
+        }
+        ih = oh * SH - 1 + DH;
+        if (ih >= 0 && ih <= IH - 1) //dma for second row
+        {
+            event_t e = async_work_group_copy_2D2D(
+                src_local + IW, // dst
+                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
+                IW, // num_elements_per_line,
+                IC / GC, // num_lines,
+                IH * IW - IW, // src_line_stride,
+                2 * IW, // dst_line_stride,
+                0);
+            wait_group_events(1, &e);
+        }
+        ih = oh * SH - 1 + 2 * DH;
+        if (ih >= 0 && ih <= IH - 1) //dma for third row
+        {
+            event_t e = async_work_group_copy_2D2D(
+                src_local + 2 * IW, // dst
+                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
+                IW, // num_elements_per_line,
+                IC / GC, // num_lines,
+                IH * IW - IW, // src_line_stride,
+                2 * IW, // dst_line_stride,
+                0);
+            wait_group_events(1, &e);
+        }
+    }

    half pad_value_half = convert_half(pad_value);

    //padding row
-    if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1)
-    {
-        __local half* dst = src_local;
-        for(int c = 0; c < IC/GC; c++)
-        {
+    if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
+        __local half *dst = src_local;
+        for (int c = 0; c < IC / GC; c++) {
            #pragma unroll 8
-            for(int j = 0; j < IW; j++)
-            {
+            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += 3 * IW;
        }
    }
-    if (oh * SH + DH - 1 > IH - 1)
-    {
-        __local half* dst = src_local + IW;
-        for(int c = 0; c < IC/GC; c++)
-        {
+    if (oh * SH + DH - 1 > IH - 1) {
+        __local half *dst = src_local + IW;
+        for (int c = 0; c < IC / GC; c++) {
            #pragma unroll 8
-            for(int j = 0; j < IW; j++)
-            {
+            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += 3 * IW;
        }
    }
-    if (oh * SH + DH + DH - 1 > IH - 1)
-    {
-        __local half* dst = src_local + 2 * IW;
-        for(int c = 0; c < IC/GC; c++)
-        {
+    if (oh * SH + DH + DH - 1 > IH - 1) {
+        __local half *dst = src_local + 2 * IW;
+        for (int c = 0; c < IC / GC; c++) {
            #pragma unroll 8
-            for(int j = 0; j < IW; j++)
-            {
+            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += 3 * IW;
@ -97,13 +146,12 @@ __kernel void binary_convolution(
    ushort8 in21;
    ushort8 in22;

-    for (int ows8 = 0; ows8 < (OWS+7)/8; ows8++)
-    {
+    for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
        ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
-        for (int ic = 0; ic < IC/GC; ++ic)
-        {
-            __local half* src = (__local half*)((__local half8*)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
-            int weight_pos = oc*IC/GC*3*3 + ic*3*3;
+        for (int ic = 0; ic < IC / GC; ++ic) {
+            __local half *src =
+                (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
+            int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3;
            ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
            ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
            ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
@ -114,64 +162,55 @@ __kernel void binary_convolution(
            ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
            ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));

-            if ((ows8 * 8) - 1 <= IW - 1)
-            {
-                in00 = *((__local ushort8*)(src - IW - DW));
-                in01 = *((__local ushort8*)(src - IW));
-                in02 = *((__local ushort8*)(src - IW + DW));
+            if ((ows8 * 8) - 1 <= IW - 1) {
+                in00 = *((__local ushort8 *)(src - IW - DW));
+                in01 = *((__local ushort8 *)(src - IW));
+                in02 = *((__local ushort8 *)(src - IW + DW));

-                in10 = *((__local ushort8*)(src - DW));
-                in11 = *((__local ushort8*)(src));
-                in12 = *((__local ushort8*)(src + DW));
+                in10 = *((__local ushort8 *)(src - DW));
+                in11 = *((__local ushort8 *)(src));
+                in12 = *((__local ushort8 *)(src + DW));

-                in20 = *((__local ushort8*)(src + IW - DW));
-                in21 = *((__local ushort8*)(src + IW));
-                in22 = *((__local ushort8*)(src + IW + DW));
+                in20 = *((__local ushort8 *)(src + IW - DW));
+                in21 = *((__local ushort8 *)(src + IW));
+                in22 = *((__local ushort8 *)(src + IW + DW));
            }

            //padding column
-            if (ows8 * 8 - 1 < 0)
-            {
+            if (ows8 * 8 - 1 < 0) {
                int boundary = 1 - ows8 * 8;
-                boundary = boundary > 8 ? 8 : boundary;
-                for (int offset = 0; offset < boundary; offset++)
-                {
-                    *((half*)(&in00) + offset) = pad_value_half;
-                    *((half*)(&in10) + offset) = pad_value_half;
-                    *((half*)(&in20) + offset) = pad_value_half;
-                }
-            }            
-            if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1)
-            {
-                int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
-                boundary = boundary < 0 ? 0 : boundary;
-                for (int offset = boundary; offset < 8; offset++)
-                {
-                    *((half*)(&in02) + offset) = pad_value_half;
-                    *((half*)(&in12) + offset) = pad_value_half;
-                    *((half*)(&in22) + offset) = pad_value_half;
-                }
-            }            
-            if ((ows8 * 8 + 7) + DW - 1 > IW - 1)
-            {   
-                int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
-                boundary = boundary < 0 ? 0 : boundary;
-                for (int offset = boundary; offset < 8; offset++)
-                {
-                    *((half*)(&in01) + offset) = pad_value_half;
-                    *((half*)(&in11) + offset) = pad_value_half;
-                    *((half*)(&in21) + offset) = pad_value_half;
+                boundary     = boundary > 8 ? 8 : boundary;
+                for (int offset = 0; offset < boundary; offset++) {
+                    *((half *)(&in00) + offset) = pad_value_half;
+                    *((half *)(&in10) + offset) = pad_value_half;
+                    *((half *)(&in20) + offset) = pad_value_half;
                }
            }
-            if ((ows8 * 8 + 7) - 1 > IW - 1)
-            {
+            if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
+                int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in02) + offset) = pad_value_half;
+                    *((half *)(&in12) + offset) = pad_value_half;
+                    *((half *)(&in22) + offset) = pad_value_half;
+                }
+            }
+            if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
+                int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in01) + offset) = pad_value_half;
+                    *((half *)(&in11) + offset) = pad_value_half;
+                    *((half *)(&in21) + offset) = pad_value_half;
+                }
+            }
+            if ((ows8 * 8 + 7) - 1 > IW - 1) {
                int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
-                boundary = boundary < 0 ? 0 : boundary;
-                for (int offset = boundary; offset < 8; offset++)
-                {
-                    *((half*)(&in00) + offset) = pad_value_half;
-                    *((half*)(&in10) + offset) = pad_value_half;
-                    *((half*)(&in20) + offset) = pad_value_half;
+                boundary     = boundary < 0 ? 0 : boundary;
+                for (int offset = boundary; offset < 8; offset++) {
+                    *((half *)(&in00) + offset) = pad_value_half;
+                    *((half *)(&in10) + offset) = pad_value_half;
+                    *((half *)(&in20) + offset) = pad_value_half;
                }
            }

@ -185,16 +224,34 @@ __kernel void binary_convolution(
            ushort8 w21 = (ushort8)(w7);
            ushort8 w22 = (ushort8)(w8);

-            ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-            ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
-        
+            ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+            ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
+                                (ushort8)(1) :
+                                (ushort8)(0);
+
            val += (cond0 ^ w00);
            val += (cond1 ^ w01);
            val += (cond2 ^ w02);
@ -207,150 +264,15 @@ __kernel void binary_convolution(
        }

        ushort8 val_shift = val << 1;
-        int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
-        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++)
-        {
-            *(dst_local + ow) = (half)(IC/GC*KH*KW - *((ushort*)(&val_shift) + ow * SW - ows8 * 8));
+        int boundary      = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
+        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
+            *(dst_local + ow) =
+                (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
        }
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
+    wait_group_events(1, &e2);
 }
-
-__kernel void __dma_preload_binary_convolution(
-                        const __global half*  restrict src_data,
-                        const __global uchar* restrict weights_data,
-                        const __global half*  restrict dst_data,
-                                 float pad_value,
-
-                                       int    IW,
-                                       int    IH,
-                                       int    IC,
-
-                                       int    DW,
-                                       int    DH,
-
-                                       int    GC,
-
-                                       int    KW,
-                                       int    KH,
-
-                                       int    PW,
-                                       int    PH,
-
-                                       int    SW,
-                                       int    SH,
-
-                                       int    OW,
-                              __local  half*  restrict src_local,
-                        const __local  half*  restrict dst_local)
-{
-    const int oh = get_group_id(0);
-    const int oc = get_group_id(1);
-    const int OH = get_global_size(0);
-    const int OC = get_global_size(1);
-
-    const int gc = oc / (OC/GC);
-
-    if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
-    {
-        const __global half* src = src_data + (gc * IC/GC) * IW * IH + (SH * oh - 1) * IW;
-        WorkGroupDmaCreate3DTransaction(
-            src, //src,
-            src_local, //dst,
-            IW * sizeof(half), //src width,
-            IW * sizeof(half), //dst width,
-            DH * IW * sizeof(half), //src stride,
-            IW * sizeof(half), //dst stride,
-            IC/GC, //num planes //hang when > 256
-            IH * IW * sizeof(half), //src plane stride,
-            3 * IW * sizeof(half), //dst plane stride,
-            3 * IW * sizeof(half), //plane size,
-            0
-            ); 
-  
-    }
-    else
-    {
-        int ih = oh * SH - 1;
-        if (ih >= 0 && ih <= IH - 1) //dma for first row
-        {
-            const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW;
-            __local half* dst = src_local;
-            WorkGroupDmaCreateStrideTransaction(
-                src, // src
-                dst, // dst
-                IW * sizeof(half), // src width
-                IW * sizeof(half), // dst width
-                IH * IW * sizeof(half), // src stride
-                3 * IW * sizeof(half),  // dst stride
-                IW * IC/GC * sizeof(half), //total size
-                0
-                );
-        }
-        ih = oh * SH - 1 + DH;
-        if (ih >= 0 && ih <= IH - 1) //dma for second row
-        {
-            const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW;
-            __local half* dst = src_local + IW;
-            WorkGroupDmaCreateStrideTransaction(
-                src, // src
-                dst, // dst
-                IW * sizeof(half), // src width
-                IW * sizeof(half), // dst width
-                IH * IW * sizeof(half), // src stride
-                3 * IW * sizeof(half),  // dst stride
-                IW * IC/GC * sizeof(half), //total size
-                0
-                );
-        }
-        ih = oh * SH - 1 + 2 * DH;
-        if (ih >= 0 && ih <= IH - 1) //dma for third row
-        {
-            const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW;
-            __local half* dst = src_local + 2 * IW;
-            WorkGroupDmaCreateStrideTransaction(
-                src, // src
-                dst, // dst
-                IW * sizeof(half), // src width
-                IW * sizeof(half), // dst width
-                IH * IW * sizeof(half), // src stride
-                3 * IW * sizeof(half),  // dst stride
-                IW * IC/GC * sizeof(half), //total size
-                0
-                );
-        }
-    }
-}
-__kernel void __dma_postwrite_binary_convolution(
-                        const __global half*  restrict src_data,
-                        const __global uchar* restrict weights_data,
-                              __global half*  restrict dst_data,
-                                 float pad_value,
-
-                                       int    IW,
-                                       int    IH,
-                                       int    IC,
-
-                                       int    DW,
-                                       int    DH,
-
-                                       int    GC,
-
-                                       int    KW,
-                                       int    KH,
-
-                                       int    PW,
-                                       int    PH,
-
-                                       int    SW,
-                                       int    SH,
-
-                                       int    OW,
-                        const __local  half*  restrict src_local,
-                        const __local  half*  restrict dst_local)
-{
-    const int oh = get_group_id(0);
-    const int oc = get_group_id(1);
-    const int OH = get_global_size(0);
-
-    async_work_group_copy(dst_data + oc*OW*OH + oh*OW, dst_local, OW, 0);
-}
--- a/inference-engine/src/vpu/custom_kernels/binary_layers.cl
+++ b/inference-engine/src/vpu/custom_kernels/binary_layers.cl
@ -1,339 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-int extract_weights(uchar val, int bit) {
-  return ((val >> bit) & 1);
-}
-
-__kernel void binary_convolution(const __global half* restrict src_data,
-                                 const __global uchar* restrict weights_data,
-                                 __global half* restrict dst_data,
-                                 float pad_value,
-
-                                 int IW,
-                                 int IH,
-                                 int IC,
-
-                                 int DW,
-                                 int DH,
-
-                                 int GC,
-
-                                 int KW,
-                                 int KH,
-
-                                 int PW,
-                                 int PH,
-
-                                 int SW,
-                                 int SH)
-{
-    int ipad_value = ((pad_value > 0.f) ? 1 : 0);
-    int c = get_global_id(2);
-    int y = get_global_id(1);
-    int x = get_global_id(0);
-
-    int OC = get_global_size(2);
-    int OH = get_global_size(1);
-    int OW = get_global_size(0);
-
-    int KD = 1;
-    int SD = 0;
-    int DD = 0;
-    int PD = 0;
-    int ID = 1;
-    int OD = 1;
-
-    int nbits = 8;
-
-    int g  = c % GC;
-    int oc = c / GC;
-    int oh = y;
-    int ow = x;
-
-    for (int od = 0; od < OD; od++) {
-        int oidx = g  * OC / GC * OD * OH * OW
-                 + oc * OD * OH * OW
-                 + od * OH * OW
-                 + oh * OW
-                 + ow;
-
-        int res = 0;
-
-        for (int ic = 0; ic < IC / GC; ic++) {
-            for (int kd = 0; kd < KD; kd++) {
-                for (int kh = 0; kh < KH; kh++) {
-                    for (int kw = 0; kw < KW; kw++) {
-                        int widx = g  * OC / GC * IC / GC * KD * KH * KW
-                                 + oc * IC / GC * KD * KH * KW
-                                 + ic * KD * KH * KW
-                                 + kd * KH * KW
-                                 + kh * KW
-                                 + kw;
-
-                        int w = extract_weights(weights_data[widx/nbits], (widx % nbits));
-
-                        int s;
-
-                        int iw = ow * SW - PW + kw * DW;
-                        int ih = oh * SH - PH + kh * DH;
-                        int id = od * SD - PD + kd * DD;
-
-                        if (iw < 0 || iw >= (int) IW ||
-                            ih < 0 || ih >= (int) IH ||
-                            id < 0 || id >= (int) ID) {
-                            s = ipad_value;
-                        } else {
-                            int iidx = g  * IC / GC * ID * IH * IW
-                                     + ic * ID * IH * IW
-                                     + id * IH * IW
-                                     + ih * IW
-                                     + iw;
-
-                            s = ((src_data[iidx] > 0.f) ? 1 : 0);
-                        }
-
-                        res += s ^ w;
-                    }
-                }
-            }
-        }
-
-        dst_data[oidx] = (half)(IC/GC*KD*KH*KW - 2*res);
-    }
-}
-
-__kernel void quantize(const __global half* __restrict src,
-                       const __global half* __restrict input_low,
-                       const __global half* __restrict input_high,
-                       const __global half* __restrict output_low,
-                       const __global half* __restrict output_high,
-                       const __global half* __restrict dst,
-                       int levels,
-                       int input_low_size,
-                       int input_high_size,
-                       int output_low_size,
-                       int output_high_size,
-                       int W,
-                       int H,
-                       const __local half* __restrict src_local,
-                             __local half* __restrict dst_local)
-{
-
-    int c = get_global_id(2);
-    int C = get_global_size(2);
-
-    half h_ilow  = (input_low_size   == 1 ? input_low[0]   : input_low[c]);
-    half h_ihigh = (input_high_size  == 1 ? input_high[0]  : input_high[c]);
-    half h_olow  = (output_low_size  == 1 ? output_low[0]  : output_low[c]);
-    half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);
-
-    half const1 = (half)(0.01 > (h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
-    half const2 = (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));
-
-    for (int h = 0; h < H; h++)
-    {
-        __local const half* __restrict addr_src = src_local + h*W;
-        __local       half* __restrict addr_dst = dst_local + h*W;
-
-        for (int w = 0; w < W / 8; w++)
-        {
-            half8 val = *((__local half8*)addr_src + w);
-#if 1
-            // round is too slow =( 902 b of code
-            //half8 aux = round((val - (half8)h_ilow) * (half8)const1);
-
-            half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;
-
-            aux = (half8){
-                (half)(short)(aux.s0),
-                (half)(short)(aux.s1),
-                (half)(short)(aux.s2),
-                (half)(short)(aux.s3),
-                (half)(short)(aux.s4),
-                (half)(short)(aux.s5),
-                (half)(short)(aux.s6),
-                (half)(short)(aux.s7)
-            };
-
-            aux = aux * (half8)const2 + (half8)h_olow;
-
-            // vector comparison add 756 b of assembly, so do in manually
-            // short8 a = val <= (half8)h_olow;
-            // short8 b = val >  (half8)h_ohigh;
-
-            short8 a;
-            short8 b;
-            a.s0 = (val.s0 <= h_ilow);
-            a.s1 = (val.s1 <= h_ilow);
-            a.s2 = (val.s2 <= h_ilow);
-            a.s3 = (val.s3 <= h_ilow);
-            a.s4 = (val.s4 <= h_ilow);
-            a.s5 = (val.s5 <= h_ilow);
-            a.s6 = (val.s6 <= h_ilow);
-            a.s7 = (val.s7 <= h_ilow);
-
-            b.s0 = (val.s0 > h_ihigh);
-            b.s1 = (val.s1 > h_ihigh);
-            b.s2 = (val.s2 > h_ihigh);
-            b.s3 = (val.s3 > h_ihigh);
-            b.s4 = (val.s4 > h_ihigh);
-            b.s5 = (val.s5 > h_ihigh);
-            b.s6 = (val.s6 > h_ihigh);
-            b.s7 = (val.s7 > h_ihigh);
-
-            a = ~(a-(short8)1);
-            b = ~(b-(short8)1);
-
-            short8 c1 = (~a &  b);
-            short8 c2 = (~a & ~b);
-
-            short8 res = a  & as_short8((half8)h_olow)
-                       | c1 & as_short8((half8)h_ohigh)
-                       | c2 & as_short8(aux);
-
-            *((__local half8*)addr_dst + w) = as_half8(res);
-#else
-            *((__local half8*)addr_dst + w) = val;
-#endif
-        }
-        for (int w = W & (~0x7); w < W; w++)
-        {
-            half val = addr_src[w];
-#if 1
-            short a = val <= h_ilow; a = ~(a-1);
-            short b = val > h_ihigh; b = ~(b-1);
-
-            short c1 = (~a &  b);
-            short c2 = (~a & ~b);
-
-            short res = a  & as_short(h_olow)
-                      | c1 & as_short(h_ohigh)
-                      | c2 & as_short(((half)(round( (val - h_ilow) * const1) * const2) + h_olow));
-
-            addr_dst[w] = as_half(res);
-#else
-            addr_dst[w] = val;
-#endif
-        }
-    }
-}
-__kernel void __dma_preload_quantize(const __global half* __restrict src,
-                       const __global half* __restrict input_low,
-                       const __global half* __restrict input_high,
-                       const __global half* __restrict output_low,
-                       const __global half* __restrict output_high,
-                       const __global half* __restrict dst,
-                       int levels,
-                       int input_low_size,
-                       int input_high_size,
-                       int output_low_size,
-                       int output_high_size,
-                       int W,
-                       int H,
-                             __local half* __restrict src_local,
-                       const __local half* __restrict dst_local)
-{
-    const int sizePlane = W*H;
-    async_work_group_copy(src_local ,src + get_group_id(2)*sizePlane, sizePlane, 0);
-}
-__kernel void __dma_postwrite_quantize(const __global half* __restrict src,
-                       const __global half* __restrict input_low,
-                       const __global half* __restrict input_high,
-                       const __global half* __restrict output_low,
-                       const __global half* __restrict output_high,
-                             __global half* __restrict dst,
-                       int levels,
-                       int input_low_size,
-                       int input_high_size,
-                       int output_low_size,
-                       int output_high_size,
-                       int W,
-                       int H,
-                       const __local half* __restrict src_local,
-                       const __local half* __restrict dst_local)
-{
-    const int sizePlane = W*H;
-    async_work_group_copy(dst + get_group_id(2)*sizePlane ,dst_local,  sizePlane, 0);
-}
-
-__kernel void binarization(const __global half* __restrict src,
-                           const __global half* __restrict input_low_high,
-                           const __global half* __restrict dst,
-                                          int              switch_out,
-                                          int              input_low_high_size,
-                                          int              W,
-                                          int              H,
-                           const __local  half* __restrict src_local,
-                                 __local  half* __restrict dst_local)
-{
-    int c = get_global_id(2);
-    int C = get_global_size(2);
-
-    half dst_low  = switch_out ? 1.h : -1.h;
-    half dst_high = switch_out ? -1.h : 1.h;
-
-    half s_ilow_ihigh  = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
-
-    for (int h = 0; h < H; h++) {
-
-        __local const half* __restrict addr_src = src_local + h*W;
-        __local       half* __restrict addr_dst = dst_local + h*W;
-
-#if 1
-        for (int w = 0; w < W / 8; w++) {
-
-            half8 h_src_val8 = (*((__local half8*)addr_src + w));
-
-            short8 cond1;
-            cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
-            cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
-            cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
-            cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
-            cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
-            cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
-            cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
-            cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
-
-            cond1 = ~(cond1-(short8)1);
-
-            short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);
-
-            *((__local half8*)addr_dst + w) = as_half8(res);
-        }
-#endif
-        for (int w = W & (~0x7); w < W; w++)
-        {
-            addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
-        }
-    }
-}
-__kernel void __dma_preload_binarization(const __global half* __restrict src,
-                           const __global half* __restrict input_low_high,
-                           const __global half* __restrict dst,
-                                          int              switch_out,
-                                          int              input_low_high_size,
-                                          int              W,
-                                          int              H,
-                                 __local  half* __restrict src_local,
-                           const __local  half* __restrict dst_local)
-{
-    const int sizePlane = W*H;
-    async_work_group_copy(src_local ,src + get_group_id(2)*sizePlane, sizePlane, 0);
-}
-__kernel void __dma_postwrite_binarization(const __global half* __restrict src,
-                           const __global half* __restrict input_low_high,
-                                 __global half* __restrict dst,
-                                          int              switch_out,
-                                          int              input_low_high_size,
-                                          int              W,
-                                          int              H,
-                           const __local  half* __restrict src_local,
-                           const __local  half* __restrict dst_local)
-{
-    const int sizePlane = W*H;
-    async_work_group_copy(dst + get_group_id(2)*sizePlane ,dst_local,  sizePlane, 0);
-}
--- a/inference-engine/src/vpu/custom_kernels/convolution1x1.cl
+++ b/inference-engine/src/vpu/custom_kernels/convolution1x1.cl
@ -1,281 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-__kernel void Convolution1x1_NCHW(
-                    const __global half* in, 
-                    const __global half* out, 
-                    const __global half* w,
-                                     int IW, 
-                                     int IH, 
-                                     int IC,
-                                     int OW, 
-                                     int OH, 
-                                     int OC,
-                    const __local  half* in_local,
-                          __local  half* out_local)
-{
-    int oh = get_global_id(0);
-    int oc = get_global_id(1);
-
-    int stride;
-    int write_output = 0;
-    __global half* src;
-
-    __global half8* w8  = (__global half8*)(&w[oc*IC]);
-    __global half* w1  = (__global half*)(&w[oc*IC]);
-
-   
-    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8)
-    {
-        uint iw = ow;
-        uint ih = oh;
-
-        half8 val8_0 = 0.0f;
-
-        __local half8* in8_0 = (__local half8*)(&in_local[iw + 0 * IW]);
-        __local half8* in8_1 = (__local half8*)(&in_local[iw + 1 * IW]);
-        __local half8* in8_2 = (__local half8*)(&in_local[iw + 2 * IW]);
-        __local half8* in8_3 = (__local half8*)(&in_local[iw + 3 * IW]);
-        __local half8* in8_4 = (__local half8*)(&in_local[iw + 4 * IW]);
-        __local half8* in8_5 = (__local half8*)(&in_local[iw + 5 * IW]);
-        __local half8* in8_6 = (__local half8*)(&in_local[iw + 6 * IW]);
-        __local half8* in8_7 = (__local half8*)(&in_local[iw + 7 * IW]);
-
-        for (uint ic = 0; ic < IC / 8; ic ++)
-        {
-            val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
-            val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
-            val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
-            val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
-            val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
-            val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
-            val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
-            val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
-        }
-        
-        for (uint ic = (IC & (~0x7)); ic < IC; ++ic)
-        {
-            val8_0 += *((__local half8*)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
-        }
-        *((__local half8*)&out_local[ow + 0]) = (val8_0);
-    }
-
-    uint iw = (OW & (~0x7));
-    uint ih = oh;
-
-    half8 val8_0 = 0.0f;
-
-    __local half8* in8_0 = (__local half8*)(&in_local[iw + 0 * IW]);
-    __local half8* in8_1 = (__local half8*)(&in_local[iw + 1 * IW]);
-    __local half8* in8_2 = (__local half8*)(&in_local[iw + 2 * IW]);
-    __local half8* in8_3 = (__local half8*)(&in_local[iw + 3 * IW]);
-    __local half8* in8_4 = (__local half8*)(&in_local[iw + 4 * IW]);
-    __local half8* in8_5 = (__local half8*)(&in_local[iw + 5 * IW]);
-    __local half8* in8_6 = (__local half8*)(&in_local[iw + 6 * IW]);
-    __local half8* in8_7 = (__local half8*)(&in_local[iw + 7 * IW]);
-
-    for (uint ic = 0; ic < IC / 8; ic ++)
-    {
-        val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
-        val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
-        val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
-        val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
-        val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
-        val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
-        val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
-        val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
-    }
-    
-    for (uint ic = (IC & (~0x7)); ic < IC; ++ic)
-    {
-        val8_0 += *((__local half8*)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
-    }
-    for (uint ow = (OW & (~0x7)); ow < OW; ow ++)
-    {
-        out_local[ow + 0] = (val8_0[ow % 8]);
-    }
-}
-__kernel void __dma_preload_Convolution1x1_NCHW(
-                    const __global half* in, 
-                    const __global half* out, 
-                    const __global half* w,
-                                     int IW, 
-                                     int IH, 
-                                     int IC,
-                                     int OW, 
-                                     int OH, 
-                                     int OC,
-                          __local  half* in_local,
-                    const __local  half* out_local)
-{
-    const int sizePlane = IW*IH;
-    WorkGroupDmaCreateStrideTransaction(
-        in + get_group_id(0)*IW, // src
-        in_local, // dst
-        IW * sizeof(half), // src width
-        IW * sizeof(half), // dst width
-        sizePlane * sizeof(half), // src stride
-        IW * sizeof(half),  // dst stride
-        IW * IC * sizeof(half), //total size
-        0
-        );
-}
-__kernel void __dma_postwrite_Convolution1x1_NCHW(
-                    const __global half* in, 
-                          __global half* out, 
-                    const __global half* w,
-                                     int IW, 
-                                     int IH, 
-                                     int IC,
-                                     int OW, 
-                                     int OH, 
-                                     int OC,
-                    const __local  half* in_local,
-                    const __local  half* out_local)
-{
-    async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0);
-}
-
-__kernel void Convolution1x1_NHWC(
-                const __global half* in, 
-                const __global half* out, 
-                const __global half* w,
-                                int  IW, 
-                                int  IH, 
-                                int  IC,
-                                int  OW, 
-                                int  OH, 
-                                int  OC,
-                const __local  half* in_local,
-                      __local  half* out_local)
-{
-    int oh = get_global_id(0);
-    int oc = get_global_id(1);
-
-    int stride;
-    int write_output = 0;
-    __global half* src;
-
-    __global half8* w8  = (__global half8*)(&w[oc*IC]);
-    __global half* w1  = (__global half*)(&w[oc*IC]);
-
-    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8)
-    {
-        uint iw = ow;
-        uint ih = oh;
-
-        half8 val8_0 = 0.0f;
-        half8 val8_1 = 0.0f;
-        half8 val8_2 = 0.0f;
-        half8 val8_3 = 0.0f;
-        half8 val8_4 = 0.0f;
-        half8 val8_5 = 0.0f;
-        half8 val8_6 = 0.0f;
-        half8 val8_7 = 0.0f;
-
-        __local half8* in8_0 = (__local half8*)(&in_local[(iw + 0) * IC]);
-        __local half8* in8_1 = (__local half8*)(&in_local[(iw + 1) * IC]);
-        __local half8* in8_2 = (__local half8*)(&in_local[(iw + 2) * IC]);
-        __local half8* in8_3 = (__local half8*)(&in_local[(iw + 3) * IC]);
-        __local half8* in8_4 = (__local half8*)(&in_local[(iw + 4) * IC]);
-        __local half8* in8_5 = (__local half8*)(&in_local[(iw + 5) * IC]);
-        __local half8* in8_6 = (__local half8*)(&in_local[(iw + 6) * IC]);
-        __local half8* in8_7 = (__local half8*)(&in_local[(iw + 7) * IC]);
-
-        for (uint ic = 0; ic < IC / 8; ++ic)
-        {
-            val8_0 += (in8_0[ic]) * (w8[ic]);
-            val8_1 += (in8_1[ic]) * (w8[ic]);
-            val8_2 += (in8_2[ic]) * (w8[ic]);
-            val8_3 += (in8_3[ic]) * (w8[ic]);
-            val8_4 += (in8_4[ic]) * (w8[ic]);
-            val8_5 += (in8_5[ic]) * (w8[ic]);
-            val8_6 += (in8_6[ic]) * (w8[ic]);
-            val8_7 += (in8_7[ic]) * (w8[ic]);
-        }
-
-        half val_0 = 0.0f;
-        half val_1 = 0.0f;
-        half val_2 = 0.0f;
-        half val_3 = 0.0f;
-        half val_4 = 0.0f;
-        half val_5 = 0.0f;
-        half val_6 = 0.0f;
-        half val_7 = 0.0f;
-        for (uint ic = IC & (~0x7); ic < IC; ++ic)
-        {
-            val_0 += *((__local half*)in8_0 + ic) * (*((__global half*)w8 + ic));
-            val_1 += *((__local half*)in8_1 + ic) * (*((__global half*)w8 + ic));
-            val_2 += *((__local half*)in8_2 + ic) * (*((__global half*)w8 + ic));
-            val_3 += *((__local half*)in8_3 + ic) * (*((__global half*)w8 + ic));
-            val_4 += *((__local half*)in8_4 + ic) * (*((__global half*)w8 + ic));
-            val_5 += *((__local half*)in8_5 + ic) * (*((__global half*)w8 + ic));
-            val_6 += *((__local half*)in8_6 + ic) * (*((__global half*)w8 + ic));
-            val_7 += *((__local half*)in8_7 + ic) * (*((__global half*)w8 + ic));
-        }
-        out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
-        out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
-        out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
-        out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
-        out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
-        out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
-        out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
-        out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
-    }
-    for (uint ow = (OW & (~0x7)); ow < OW; ow ++)
-    {
-
-        uint iw = ow;
-        uint ih = oh;
-
-        half8 val8 = 0.0f;
-
-        __local half8* in8 = (__local half8*)(&in_local[iw * IC]);
-
-        for (uint ic = 0; ic < IC / 8; ++ic)
-        {
-            val8 += (in8[ic]) * (w8[ic]);
-        }
-
-        half val = 0.0f;
-        for (uint ic = (IC & (~0x7)); ic < IC; ++ic)
-        {
-            val += (*((__local half*)in8 + ic)) * (*((__global half*)w8 + ic));
-        }
-        out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
-    }
-}
-__kernel void __dma_preload_Convolution1x1_NHWC(
-                const __global half* in, 
-                const __global half* out, 
-                const __global half* w,
-                                int  IW, 
-                                int  IH, 
-                                int  IC,
-                                int  OW, 
-                                int  OH, 
-                                int  OC,
-                      __local  half* in_local,
-                const __local  half* out_local)
-{
-    const int sizeAct = IW*IC;
-    async_work_group_copy(in_local, in + get_group_id(0)*sizeAct, sizeAct, 0);
-}
-__kernel void __dma_postwrite_Convolution1x1_NHWC(
-                const __global half* in, 
-                      __global half* out, 
-                const __global half* w,
-                                int  IW, 
-                                int  IH, 
-                                int  IC,
-                                int  OW, 
-                                int  OH, 
-                                int  OC,
-                const __local  half* in_local,
-                const __local  half* out_local)
-{
-    async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0);
-}
--- a/inference-engine/src/vpu/custom_kernels/convolution1x1_chw.cl
+++ b/inference-engine/src/vpu/custom_kernels/convolution1x1_chw.cl
@ -0,0 +1,114 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+__kernel void Convolution1x1_NCHW(
+    const __global half *in,
+    const __global half *out,
+    const __global half *w,
+    int IW,
+    int IH,
+    int IC,
+    int OW,
+    int OH,
+    int OC)
+{
+    __local half in_local[8 * 1024];
+    __local half out_local[8 * 1024];
+
+    event_t e1 = async_work_group_copy_2D2D(
+        in_local, // dst
+        in + get_group_id(0) * IW, // src
+        IW, // num_elements_per_line,
+        IC, // num_lines,
+        IW * IH - IW, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+    wait_group_events(1, &e1);
+
+    int oh = get_global_id(0);
+    int oc = get_global_id(1);
+
+    int stride;
+    int write_output = 0;
+    __global half *src;
+
+    __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
+    __global half *w1  = (__global half *)(&w[oc * IC]);
+
+    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
+        uint iw = ow;
+        uint ih = oh;
+
+        half8 val8_0 = 0.0f;
+
+        __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
+        __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
+        __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
+        __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
+        __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
+        __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
+        __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
+        __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
+
+        for (uint ic = 0; ic < IC / 8; ic++) {
+            val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
+            val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
+            val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
+            val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
+            val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
+            val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
+            val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
+            val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
+        }
+
+        for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
+            val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
+        }
+        *((__local half8 *)&out_local[ow + 0]) = (val8_0);
+    }
+
+    uint iw = (OW & (~0x7));
+    uint ih = oh;
+
+    half8 val8_0 = 0.0f;
+
+    __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
+    __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
+    __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
+    __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
+    __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
+    __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
+    __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
+    __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
+
+    for (uint ic = 0; ic < IC / 8; ic++) {
+        val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
+        val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
+        val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
+        val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
+        val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
+        val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
+        val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
+        val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
+    }
+
+    for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
+        val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
+    }
+    for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
+        out_local[ow + 0] = (val8_0[ow % 8]);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(
+        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
+        out_local,
+        OW,
+        0);
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/convolution1x1_hwc.cl
+++ b/inference-engine/src/vpu/custom_kernels/convolution1x1_hwc.cl
@ -0,0 +1,126 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+__kernel void Convolution1x1_NHWC(
+    const __global half *in,
+    const __global half *out,
+    const __global half *w,
+    int IW,
+    int IH,
+    int IC,
+    int OW,
+    int OH,
+    int OC)
+{
+
+    __local half in_local[8 * 1024];
+    __local half out_local[8 * 1024];
+
+    const int sizeAct = IW * IC;
+
+    event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
+    wait_group_events(1, &e1);
+
+    int oh = get_global_id(0);
+    int oc = get_global_id(1);
+
+    int stride;
+    int write_output = 0;
+    __global half *src;
+
+    __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
+    __global half *w1  = (__global half *)(&w[oc * IC]);
+
+    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
+        uint iw = ow;
+        uint ih = oh;
+
+        half8 val8_0 = 0.0f;
+        half8 val8_1 = 0.0f;
+        half8 val8_2 = 0.0f;
+        half8 val8_3 = 0.0f;
+        half8 val8_4 = 0.0f;
+        half8 val8_5 = 0.0f;
+        half8 val8_6 = 0.0f;
+        half8 val8_7 = 0.0f;
+
+        __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
+        __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
+        __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
+        __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
+        __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
+        __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
+        __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
+        __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);
+
+        for (uint ic = 0; ic < IC / 8; ++ic) {
+            val8_0 += (in8_0[ic]) * (w8[ic]);
+            val8_1 += (in8_1[ic]) * (w8[ic]);
+            val8_2 += (in8_2[ic]) * (w8[ic]);
+            val8_3 += (in8_3[ic]) * (w8[ic]);
+            val8_4 += (in8_4[ic]) * (w8[ic]);
+            val8_5 += (in8_5[ic]) * (w8[ic]);
+            val8_6 += (in8_6[ic]) * (w8[ic]);
+            val8_7 += (in8_7[ic]) * (w8[ic]);
+        }
+
+        half val_0 = 0.0f;
+        half val_1 = 0.0f;
+        half val_2 = 0.0f;
+        half val_3 = 0.0f;
+        half val_4 = 0.0f;
+        half val_5 = 0.0f;
+        half val_6 = 0.0f;
+        half val_7 = 0.0f;
+        for (uint ic = IC & (~0x7); ic < IC; ++ic) {
+            val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic));
+            val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic));
+            val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic));
+            val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic));
+            val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic));
+            val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic));
+            val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic));
+            val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic));
+        }
+        out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
+        out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
+        out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
+        out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
+        out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
+        out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
+        out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
+        out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
+    }
+    for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
+
+        uint iw = ow;
+        uint ih = oh;
+
+        half8 val8 = 0.0f;
+
+        __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);
+
+        for (uint ic = 0; ic < IC / 8; ++ic) {
+            val8 += (in8[ic]) * (w8[ic]);
+        }
+
+        half val = 0.0f;
+        for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
+            val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic));
+        }
+        out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(
+        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
+        out_local,
+        OW,
+        0);
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/convolution3x3.cl
+++ b/inference-engine/src/vpu/custom_kernels/convolution3x3.cl
@ -3,64 +3,89 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-__kernel void Convolution3x3(const __global half*  in_param,
-                             const __global half*  out,
-                             const __global half*  w,
-                             int IW, int IH, int IC,
-                             int OW, int OH, int OC, int KX, int KY,
-                             int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y,
-                             const __local  half* in_local,
-                                   __local  half* out_local,
-                             const __local  half* w_local)
+__kernel void Convolution3x3(
+    const __global half *in_param,
+    const __global half *out,
+    const __global half *w,
+    int IW,
+    int IH,
+    int IC,
+    int OW,
+    int OH,
+    int OC,
+    int KX,
+    int KY,
+    int stride_x,
+    int stride_y,
+    int pad_x,
+    int pad_y,
+    int dilation_x,
+    int dilation_y)
 {
+    __local half in_local[8 * 1024];
+    __local half out_local[8 * 1024];
+    __local half w_local[8 * 1024];
+
+    const int sizePlane = IW * IH;
+    event_t e1          = async_work_group_copy_2D2D(
+        in_local, // dst
+        in_param + get_group_id(0) * stride_y * IW, // src
+        3 * IW, // num_elements_per_line,
+        IC, // num_lines,
+        IW * IH - 3 * IW, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+    wait_group_events(1, &e1);
+
+    const int sizeWeight = IC * 3 * 3;
+    e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0);
+    wait_group_events(1, &e1);
+
    int oh = get_global_id(0);
    int oc = get_global_id(1);

-    __local half*   in = (__local half* )in_local + 1;
+    __local half *in = (__local half *)in_local + 1;

    int stride;
    int write_output = 0;
-    __local half*  src;
+    __local half *src;

-    if((stride_x == 1) && (stride_y == 1))
-    {
-        stride = OW / 8;
+    if ((stride_x == 1) && (stride_y == 1)) {
+        stride       = OW / 8;
        write_output = 1;
    }
-    if((stride_x == 2) && (stride_y == 2))
-    {
-        stride = OW / 4;
+    if ((stride_x == 2) && (stride_y == 2)) {
+        stride       = OW / 4;
        write_output = 2;
    }

-    for (int ow = 0; ow < stride; ow++)
-    {
+    for (int ow = 0; ow < stride; ow++) {
        float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
-        for (int ic = 0; ic < IC; ++ic)
-        {
-            src = (__local half* )((__local half8*)(in + ic * IW * 3) + ow);
-            __local half*  k = (__local half* )(w_local + ic*3*3);
+        for (int ic = 0; ic < IC; ++ic) {
+            src             = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow);
+            __local half *k = (__local half *)(w_local + ic * 3 * 3);

-            half8 aux_in00 = *((__local half8*)src  - 1);
-            half8 aux_in01 = *((__local half8*)src  + 0);
-            half8 aux_in02 = *((__local half8*)src  + 1);
-            half8 aux_in10 = *((__local half8*)(src + IW) - 1);
-            half8 aux_in11 = *((__local half8*)(src + IW) + 0);
-            half8 aux_in12 = *((__local half8*)(src + IW) + 1);
-            half8 aux_in20 = *((__local half8*)(src + IW * 2) - 1);
-            half8 aux_in21 = *((__local half8*)(src + IW * 2) + 0);
-            half8 aux_in22 = *((__local half8*)(src + IW * 2) + 1);
+            half8 aux_in00 = *((__local half8 *)src - 1);
+            half8 aux_in01 = *((__local half8 *)src + 0);
+            half8 aux_in02 = *((__local half8 *)src + 1);
+            half8 aux_in10 = *((__local half8 *)(src + IW) - 1);
+            half8 aux_in11 = *((__local half8 *)(src + IW) + 0);
+            half8 aux_in12 = *((__local half8 *)(src + IW) + 1);
+            half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1);
+            half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0);
+            half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1);

-            short8 in00 = *((short8*)&aux_in00);
-            short8 in01 = *((short8*)&aux_in01);
-            short8 in02 = *((short8*)&aux_in02);
-            short8 in10 = *((short8*)&aux_in10);
-            short8 in11 = *((short8*)&aux_in11);
-            short8 in12 = *((short8*)&aux_in12);
-            short8 in20 = *((short8*)&aux_in20);
-            short8 in21 = *((short8*)&aux_in21);
-            short8 in22 = *((short8*)&aux_in22);
+            short8 in00 = *((short8 *)&aux_in00);
+            short8 in01 = *((short8 *)&aux_in01);
+            short8 in02 = *((short8 *)&aux_in02);
+            short8 in10 = *((short8 *)&aux_in10);
+            short8 in11 = *((short8 *)&aux_in11);
+            short8 in12 = *((short8 *)&aux_in12);
+            short8 in20 = *((short8 *)&aux_in20);
+            short8 in21 = *((short8 *)&aux_in21);
+            short8 in22 = *((short8 *)&aux_in22);

            short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14);
            short8 aux_aux01 = in01;
@ -72,15 +97,15 @@ __kernel void Convolution3x3(const __global half*  in_param,
            short8 aux_aux21 = in21;
            short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2);

-            half8 aux00 = *((half8*)&aux_aux00);
-            half8 aux01 = *((half8*)&aux_aux01);
-            half8 aux02 = *((half8*)&aux_aux02);
-            half8 aux10 = *((half8*)&aux_aux10);
-            half8 aux11 = *((half8*)&aux_aux11);
-            half8 aux12 = *((half8*)&aux_aux12);
-            half8 aux20 = *((half8*)&aux_aux20);
-            half8 aux21 = *((half8*)&aux_aux21);
-            half8 aux22 = *((half8*)&aux_aux22);
+            half8 aux00 = *((half8 *)&aux_aux00);
+            half8 aux01 = *((half8 *)&aux_aux01);
+            half8 aux02 = *((half8 *)&aux_aux02);
+            half8 aux10 = *((half8 *)&aux_aux10);
+            half8 aux11 = *((half8 *)&aux_aux11);
+            half8 aux12 = *((half8 *)&aux_aux12);
+            half8 aux20 = *((half8 *)&aux_aux20);
+            half8 aux21 = *((half8 *)&aux_aux21);
+            half8 aux22 = *((half8 *)&aux_aux22);

            half8 w00 = (half8)(*(k + 0));
            half8 w01 = (half8)(*(k + 1));
@ -102,69 +127,32 @@ __kernel void Convolution3x3(const __global half*  in_param,
            val += convert_float8(aux21) * convert_float8(w21);
            val += convert_float8(aux22) * convert_float8(w22);
        }
-        if(write_output == 2)
-            *((__local half4*)(out_local) + ow) = convert_half4(val.s0246);
-        if(write_output == 1)
-            *((__local half8*)(out_local) + ow) = convert_half8(val);
+        if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246);
+        if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val);
    }

-    for (int ow = OW & ~(0x7); ow < OW; ow++)
-    {
+    for (int ow = OW & ~(0x7); ow < OW; ow++) {
        float val = 0.0f;
-        for (int ic = 0; ic < IC; ++ic)
-        {
-            for (int ky = 0; ky < 3; ++ky)
-            {
-                for (int kx = 0; kx < 3; ++kx)
-                {
+        for (int ic = 0; ic < IC; ++ic) {
+            for (int ky = 0; ky < 3; ++ky) {
+                for (int kx = 0; kx < 3; ++kx) {
                    int iw = ow * stride_x - pad_x + kx * dilation_x;
                    int ih = oh * stride_y - pad_y + ky * dilation_y;

-                    val += convert_float(in[ic*IW*3 + (ky * dilation_y)*IW + iw]) * convert_float(w_local[ic*3*3 + ky*3 + kx]);
+                    val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw])
+                           * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]);
                }
            }
        }
        out_local[ow] = convert_half(val);
    }
-}

-__kernel void __dma_preload_Convolution3x3(
-                             const __global half*  in_param,
-                             const __global half*  out,
-                             const __global half*  w,
-                             int IW, int IH, int IC,
-                             int OW, int OH, int OC, int KX, int KY,
-                             int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y,
-                                   __local  half* in_local,
-                             const __local  half* out_local,
-                                   __local  half* w_local)
-{
-    const int sizePlane = IW*IH;
-    WorkGroupDmaCreateStrideTransaction(
-        in_param + get_group_id(0)*stride_y*IW, // src
-        in_local, // dst
-        3 * IW * sizeof(half), // src width
-        3 * IW * sizeof(half), // dst width
-        sizePlane * sizeof(half), // src stride
-        3 * IW * sizeof(half),  // dst stride
-        3 * IW * IC * sizeof(half), //total size
-        0
-        );
+    barrier(CLK_LOCAL_MEM_FENCE);

-    const int sizeWeight = IC*3*3;
-    async_work_group_copy(w_local, w + get_group_id(1)*sizeWeight, sizeWeight, 0);    
-}
-
-__kernel void __dma_postwrite_Convolution3x3(
-                             const __global half*  in_param,
-                                   __global half*  out,
-                             const __global half*  w,
-                             int IW, int IH, int IC,
-                             int OW, int OH, int OC, int KX, int KY,
-                             int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y,
-                             const __local  half* in_local,
-                             const __local  half* out_local,
-                             const __local  half* w_local)
-{
-    async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0);
+    event_t e2 = async_work_group_copy(
+        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
+        out_local,
+        OW,
+        0);
+    wait_group_events(1, &e2);
 }
--- a/inference-engine/src/vpu/custom_kernels/correlate.cl
+++ b/inference-engine/src/vpu/custom_kernels/correlate.cl
@ -4,112 +4,105 @@

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-#define MAX_OPENCL_BUFF_SIZE 64*1024
+#define MAX_OPENCL_BUFF_SIZE 64 * 1024

-// Define if runtime supports it. MX runtime is compatible, KMB is in WIP state
-#define USE_MANUAL_DMA 1
+#define USE_DMA 1

-#if defined (USE_MANUAL_DMA)
-void dmacpyLineSrcStrideStart(global half* from, private half* to, int size, int src_width, int src_stride)
+#if defined(USE_DMA)
+void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride)
 {
-    item_dma_event_t copyEvent = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
+    item_dma_event_t copyEvent =
+        WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
    WaitWorkItemDmaEvents(1, &copyEvent);
 }

-void dmacpyLineDstStrideStart(private half* from, global half* to, int size, int src_width, int src_stride)
+void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride)
 {
-    item_dma_event_t copyEvent = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
+    item_dma_event_t copyEvent =
+        WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
    WaitWorkItemDmaEvents(1, &copyEvent);
 }
 #endif

-void memzero(void * ptr, size_t num)
+void memzero(void *ptr, size_t num)
 {
-    float4* line0_ = (float4*) ptr;
+    float4 *line0_ = (float4 *)ptr;
    #pragma unroll 16
-    for (int i = 0; i < num/16; i++)
-    {
+    for (int i = 0; i < num / 16; i++) {
        line0_[i] = (float4){0.f, 0.f, 0.f, 0.f};
    }
-    uchar* ptr_ = (uchar*) ptr;
-    for (int i = num/16*16; i < num; i++)
-    {
+    uchar *ptr_ = (uchar *)ptr;
+    for (int i = num / 16 * 16; i < num; i++) {
        ptr_[i] = 0;
    }
 }

-void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
-                                           __private const half* restrict line1,
-                                           __private half* restrict dline,
-                                           int topwidth,
-                                           int max_displacement,
-                                           int neighborhood_grid_radius,
-                                           int kernel_size,
-                                           int padding,
-                                           int bottomwidth,
-                                           int stride1,
-                                           int stride2,
-                                           int max_channels,
-                                           int cur_subchannels)
+void __attribute__((noinline)) crosscorrh(
+    __private const half *restrict line0,
+    __private const half *restrict line1,
+    __private half *restrict dline,
+    int topwidth,
+    int max_displacement,
+    int neighborhood_grid_radius,
+    int kernel_size,
+    int padding,
+    int bottomwidth,
+    int stride1,
+    int stride2,
+    int max_channels,
+    int cur_subchannels)
 {
-    if (max_channels == 64)
-    {
-        for (int i = 0; i < kernel_size; i++)
-        {
-            int x1 = max_displacement - padding + i;
-            int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1)/stride1;
-            x1 += offset1*stride1;
+    if (max_channels == 64) {
+        for (int i = 0; i < kernel_size; i++) {
+            int x1      = max_displacement - padding + i;
+            int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1;
+            x1 += offset1 * stride1;

-            for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1)
-            {
-                int x2 = x1 - neighborhood_grid_radius*stride2;
-                int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1)/stride2;
-                x2 += offset2*stride2;
+            for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) {
+                int x2      = x1 - neighborhood_grid_radius * stride2;
+                int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1) / stride2;
+                x2 += offset2 * stride2;

                for (int top_channel_x = offset2 - neighborhood_grid_radius;
                     top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth;
-                     top_channel_x++, x2 += stride2)
-                {
+                     top_channel_x++, x2 += stride2) {
                    half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

-                    half8* src0 = (half8*)(line0 + x1*max_channels);
-                    half8* src1 = (half8*)(line1 + x2*max_channels);
+                    half8 *src0 = (half8 *)(line0 + x1 * max_channels);
+                    half8 *src1 = (half8 *)(line1 + x2 * max_channels);

                    #pragma unroll 8
-                    for (int ch = 0; ch < max_channels/8; ch++)
-                        sum4 += (src0[ch])*(src1[ch]);
+                    for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]);

                    half sum = __builtin_shave_sau_sumx_f16_r(sum4);
-                    dline[(top_channel_x + neighborhood_grid_radius)*topwidth + blockIdx_x] += (sum);
+                    dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum);
                }
            }
        }
-    }
-    else
-    {
-        int neighborhood_grid_width = 2*neighborhood_grid_radius + 1;
+    } else {
+        int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1;

-        for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++)
-        {
-            for (int i = 0; i < kernel_size; i++)
-            {
-                int x1 = blockIdx_x*stride1 + max_displacement + i - padding;
+        for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) {
+            for (int i = 0; i < kernel_size; i++) {
+                int x1 = blockIdx_x * stride1 + max_displacement + i - padding;

-                if ((x1 >= 0) && (x1 < bottomwidth))
-                {
-                    int o_min =                                 - neighborhood_grid_radius*stride2;
-                    int o_max = neighborhood_grid_width*stride2 - neighborhood_grid_radius*stride2;
-                    if ((o_min) <  (                    - x1)) o_min -= ((x1 + o_min - (stride2 - 1))/stride2)*stride2;
-                    if ((o_max) >= (bottomwidth+stride2 - x1)) o_max -= ((x1 + o_max -  bottomwidth )/stride2)*stride2;
+                if ((x1 >= 0) && (x1 < bottomwidth)) {
+                    int o_min = -neighborhood_grid_radius * stride2;
+                    int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2;
+                    if ((o_min) < (-x1)) {
+                        o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2;
+                    }
+                    if ((o_max) >= (bottomwidth + stride2 - x1)) {
+                        o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2;
+                    }

                    int o = o_min;
-                    for (; o <= o_max - 4*stride2; o += 4*stride2)
-                    {
-                        half8* bottom0   = (half8*)(line0 + x1*max_channels);
-                        half8* bottom1_0 = (half8*)(line1 + (x1 + o + 0*stride2)*max_channels);
-                        half8* bottom1_1 = (half8*)(line1 + (x1 + o + 1*stride2)*max_channels);
-                        half8* bottom1_2 = (half8*)(line1 + (x1 + o + 2*stride2)*max_channels);
-                        half8* bottom1_3 = (half8*)(line1 + (x1 + o + 3*stride2)*max_channels);
+                    for (; o <= o_max - 4 * stride2; o += 4 * stride2) {
+                        half8 *bottom0   = (half8 *)(line0 + x1 * max_channels);
+                        half8 *bottom1_0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels);
+                        half8 *bottom1_1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels);
+                        half8 *bottom1_2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels);
+                        half8 *bottom1_3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels);

                        int c = 0;

@ -118,8 +111,7 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
                        half8 sum42 = 0;
                        half8 sum43 = 0;

-                        for (; c <= cur_subchannels/8 - 4; c += 4)
-                        {
+                        for (; c <= cur_subchannels / 8 - 4; c += 4) {
                            sum40 += bottom0[c + 0] * bottom1_0[c + 0];
                            sum40 += bottom0[c + 1] * bottom1_0[c + 1];
                            sum40 += bottom0[c + 2] * bottom1_0[c + 2];
@ -141,8 +133,7 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
                            sum43 += bottom0[c + 3] * bottom1_3[c + 3];
                        }

-                        for (; c < cur_subchannels/8; c++)
-                        {
+                        for (; c < cur_subchannels / 8; c++) {
                            sum40 += bottom0[c] * bottom1_0[c];
                            sum41 += bottom0[c] * bottom1_1[c];
                            sum42 += bottom0[c] * bottom1_2[c];
@ -154,48 +145,47 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
                        half sum2 = __builtin_shave_sau_sumx_f16_r(sum42);
                        half sum3 = __builtin_shave_sau_sumx_f16_r(sum43);

-                        for (c = c*8; c < cur_subchannels; c++)
-                        {
-                            sum0 += line0[x1*max_channels + c] * line1[(x1 + o + 0*stride2)*max_channels + c];
-                            sum1 += line0[x1*max_channels + c] * line1[(x1 + o + 1*stride2)*max_channels + c];
-                            sum2 += line0[x1*max_channels + c] * line1[(x1 + o + 2*stride2)*max_channels + c];
-                            sum3 += line0[x1*max_channels + c] * line1[(x1 + o + 3*stride2)*max_channels + c];
+                        for (c = c * 8; c < cur_subchannels; c++) {
+                            sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c];
+                            sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c];
+                            sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c];
+                            sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c];
                        }

-                        dline[blockIdx_x + (((o/stride2) + 0)*topwidth + neighborhood_grid_radius*topwidth)] += sum0;
-                        dline[blockIdx_x + (((o/stride2) + 1)*topwidth + neighborhood_grid_radius*topwidth)] += sum1;
-                        dline[blockIdx_x + (((o/stride2) + 2)*topwidth + neighborhood_grid_radius*topwidth)] += sum2;
-                        dline[blockIdx_x + (((o/stride2) + 3)*topwidth + neighborhood_grid_radius*topwidth)] += sum3;
+                        dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] +=
+                            sum0;
+                        dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] +=
+                            sum1;
+                        dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] +=
+                            sum2;
+                        dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] +=
+                            sum3;
                    }

-                    for (; o < o_max; o += 1*stride2)
-                    {
-                        half8* bottom0 = (half8*)(line0 + x1*max_channels);
-                        half8* bottom1 = (half8*)(line1 + (x1 + o)*max_channels);
+                    for (; o < o_max; o += 1 * stride2) {
+                        half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
+                        half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels);

                        int c = 0;

                        half8 sum4 = 0;
-                        for (; c <= cur_subchannels/8 - 4; c += 4)
-                        {
+                        for (; c <= cur_subchannels / 8 - 4; c += 4) {
                            sum4 += bottom0[c + 0] * bottom1[c + 0];
                            sum4 += bottom0[c + 1] * bottom1[c + 1];
                            sum4 += bottom0[c + 2] * bottom1[c + 2];
                            sum4 += bottom0[c + 3] * bottom1[c + 3];
                        }
-                        for (; c < cur_subchannels/8; c++)
-                        {
+                        for (; c < cur_subchannels / 8; c++) {
                            sum4 += bottom0[c] * bottom1[c];
                        }

                        half sum = __builtin_shave_sau_sumx_f16_r(sum4);

-                        for (c = c*8; c < cur_subchannels; c++)
-                        {
-                            sum += line0[x1*max_channels + c] * line1[(x1 + o)*max_channels + c];
+                        for (c = c * 8; c < cur_subchannels; c++) {
+                            sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c];
                        }

-                        dline[blockIdx_x + (((o + neighborhood_grid_radius*stride2)/stride2)*topwidth)] += sum;
+                        dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum;
                    }
                }
            }
@ -203,243 +193,257 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
    }
 }

-
-__kernel void correlate2_half(__global const half* restrict bottom0,
-                              __global const half* restrict bottom1,
-                              __global half* restrict top,
-                             int topwidth,
-                             int topheight,
-                             int bottomwidth,
-                             int bottomheight,
-                             int bottomchannels,
-                             int max_displacement,
-                             int padding,
-                             int neighborhood_grid_radius,
-                             int neighborhood_grid_width,
-                             int kernel_size,
-                             int stride1,
-                             int stride2)
+__kernel void correlate2_half(
+    __global const half *restrict bottom0,
+    __global const half *restrict bottom1,
+    __global half *restrict top,
+    int topwidth,
+    int topheight,
+    int bottomwidth,
+    int bottomheight,
+    int bottomchannels,
+    int max_displacement,
+    int padding,
+    int neighborhood_grid_radius,
+    int neighborhood_grid_width,
+    int kernel_size,
+    int stride1,
+    int stride2)
 {
-    int max_channels = (MAX_OPENCL_BUFF_SIZE/sizeof(half) - topwidth*neighborhood_grid_width) / (3*bottomwidth);
+    int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth);
    if (max_channels > 64) max_channels = 64;
    int subchannels_count = (bottomchannels + max_channels - 1) / max_channels;
-    int subchannels = (bottomchannels + subchannels_count-1) / subchannels_count;
+    int subchannels       = (bottomchannels + subchannels_count - 1) / subchannels_count;
    if (subchannels < max_channels) subchannels = max_channels;

-    const int sumelems = kernel_size*kernel_size*bottomchannels;
+    const int sumelems = kernel_size * kernel_size * bottomchannels;

-    __private half cmx[MAX_OPENCL_BUFF_SIZE/sizeof(half)];
+    __private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)];

-    __private half* line0 = cmx;
-    __private half* line1 = line0 + bottomwidth*subchannels;
-    __private half* dline = line1 + bottomwidth*subchannels;
+    __private half *line0 = cmx;
+    __private half *line1 = line0 + bottomwidth * subchannels;
+    __private half *dline = line1 + bottomwidth * subchannels;

    int blockIdx_y = get_global_id(0);

-#if defined(USE_MANUAL_DMA)
-    __private half* dmabuf = dline + topwidth*neighborhood_grid_width;
+#if defined(USE_DMA)
+    __private half *dmabuf = dline + topwidth * neighborhood_grid_width;
 #endif

-    int y1 = blockIdx_y*stride1 + max_displacement;
+    int y1 = blockIdx_y * stride1 + max_displacement;

-    for (int j = 0; j < kernel_size; j++)
-    {
-        for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels)
-        {
+    for (int j = 0; j < kernel_size; j++) {
+        for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) {
            // configure channel batching
            int startchannel = bottomchannel;
            int endchannel = startchannel + subchannels > bottomchannels ? bottomchannels : startchannel + subchannels;
-            int deltachannels = endchannel-startchannel;
+            int deltachannels = endchannel - startchannel;

            // load line form blob 0 with repackaging
-            if (y1+j-padding >= 0 && y1+j-padding < bottomheight)
-            {
-#if defined(USE_MANUAL_DMA)
-                __global const half* curr = bottom0 + startchannel*bottomheight*bottomwidth + (y1+j-padding)*bottomwidth;
-                dmacpyLineSrcStrideStart(curr,
-                                         dmabuf,
-                                         bottomwidth*deltachannels*sizeof(half),
-                                         bottomwidth*sizeof(half),
-                                         bottomwidth*bottomheight*sizeof(half));
+            if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) {
+#if defined(USE_DMA)
+                __global const half *curr =
+                    bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth;
+                dmacpyLineSrcStrideStart(
+                    curr,
+                    dmabuf,
+                    bottomwidth * deltachannels * sizeof(half),
+                    bottomwidth * sizeof(half),
+                    bottomwidth * bottomheight * sizeof(half));

-                for (int ch = 0; ch < deltachannels; ch++)
-                {
-                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++)
-                    {
-                        half8 val = ((half8*)(dmabuf + ch*bottomwidth))[blockIdx_x];
-                        line0[(blockIdx_x*8 + 0)*max_channels+ch] = val[0];
-                        line0[(blockIdx_x*8 + 1)*max_channels+ch] = val[1];
-                        line0[(blockIdx_x*8 + 2)*max_channels+ch] = val[2];
-                        line0[(blockIdx_x*8 + 3)*max_channels+ch] = val[3];
+                for (int ch = 0; ch < deltachannels; ch++) {
+                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
+                        half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
+                        line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
+                        line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
+                        line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
+                        line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

-                        line0[(blockIdx_x*8 + 4)*max_channels+ch] = val[4];
-                        line0[(blockIdx_x*8 + 5)*max_channels+ch] = val[5];
-                        line0[(blockIdx_x*8 + 6)*max_channels+ch] = val[6];
-                        line0[(blockIdx_x*8 + 7)*max_channels+ch] = val[7];
+                        line0[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
+                        line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
+                        line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
+                        line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
                    }

-                    for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++)
-                    {
-                        line0[(blockIdx_x)*max_channels+ch] = dmabuf[blockIdx_x + ch*bottomwidth];
+                    for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
+                        line0[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
                    }
                }

                if (deltachannels < subchannels)
                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
-                        memzero(line0 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half));
+                        memzero(
+                            line0 + blockIdx_x * max_channels + deltachannels,
+                            (subchannels - deltachannels) * sizeof(half));
 #else
-                for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
-                {
+                for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
                    for (int ch = 0; ch < deltachannels; ch++)
-                        line0[blockIdx_x*max_channels+ch]
-                            = bottom0[(ch+startchannel)*bottomheight*bottomwidth + (y1+j-padding)*bottomwidth + blockIdx_x];
+                        line0[blockIdx_x * max_channels + ch] = bottom0
+                            [(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth
+                             + blockIdx_x];

                    if (deltachannels < subchannels)
-                        memzero(line0 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half));
+                        memzero(
+                            line0 + blockIdx_x * max_channels + deltachannels,
+                            (subchannels - deltachannels) * sizeof(half));
                }
 #endif
-            }
-            else
-                memzero(line0, max_channels*bottomwidth*sizeof(half));
+            } else
+                memzero(line0, max_channels * bottomwidth * sizeof(half));

-            for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++)
-            {
+            for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) {
                int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2;

-                // load line form blob 1 with repackaging according to the line we work on now
-                if (y2+j-padding >= 0 && y2+j-padding < bottomheight)
-                {
-#if defined(USE_MANUAL_DMA)
-                    __global const half* curr = bottom1 + startchannel*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth;
-                    dmacpyLineSrcStrideStart(curr,
-                                             dmabuf,
-                                             bottomwidth*deltachannels*sizeof(half),
-                                             bottomwidth*sizeof(half),
-                                             bottomwidth*bottomheight*sizeof(half));
+                if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) {
+#if defined(USE_DMA)
+                    __global const half *curr =
+                        bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth;
+                    dmacpyLineSrcStrideStart(
+                        curr,
+                        dmabuf,
+                        bottomwidth * deltachannels * sizeof(half),
+                        bottomwidth * sizeof(half),
+                        bottomwidth * bottomheight * sizeof(half));

-                    for (int ch = 0; ch < deltachannels; ch++)
-                    {
-                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++)
-                        {
-                            half8 val = ((half8*)(dmabuf + ch*bottomwidth))[blockIdx_x];
-                            line1[(blockIdx_x*8 + 0)*max_channels+ch] = val[0];
-                            line1[(blockIdx_x*8 + 1)*max_channels+ch] = val[1];
-                            line1[(blockIdx_x*8 + 2)*max_channels+ch] = val[2];
-                            line1[(blockIdx_x*8 + 3)*max_channels+ch] = val[3];
+                    for (int ch = 0; ch < deltachannels; ch++) {
+                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
+                            half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
+                            line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
+                            line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
+                            line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
+                            line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

-                            line1[(blockIdx_x*8 + 4)*max_channels+ch] = val[4];
-                            line1[(blockIdx_x*8 + 5)*max_channels+ch] = val[5];
-                            line1[(blockIdx_x*8 + 6)*max_channels+ch] = val[6];
-                            line1[(blockIdx_x*8 + 7)*max_channels+ch] = val[7];
+                            line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
+                            line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
+                            line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
+                            line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
                        }

-                        for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++)
-                        {
-                            line1[(blockIdx_x)*max_channels+ch] = dmabuf[blockIdx_x + ch*bottomwidth];
+                        for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
+                            line1[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
                        }
                    }
 #else
-                    for (int ch = 0; ch < deltachannels; ch++)
-                    {
-                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++)
-                        {
-                            half8 val = ((__global half8*)(bottom1 + (ch+startchannel)*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth))[blockIdx_x];
-                            line1[(blockIdx_x*8 + 0)*max_channels+ch] = val[0];
-                            line1[(blockIdx_x*8 + 1)*max_channels+ch] = val[1];
-                            line1[(blockIdx_x*8 + 2)*max_channels+ch] = val[2];
-                            line1[(blockIdx_x*8 + 3)*max_channels+ch] = val[3];
+                    for (int ch = 0; ch < deltachannels; ch++) {
+                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
+                            half8 val = ((
+                                __global half8
+                                    *)(bottom1 + (ch + startchannel) * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth))
+                                [blockIdx_x];
+                            line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
+                            line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
+                            line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
+                            line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

-                            line1[(blockIdx_x*8 + 4)*max_channels+ch] = val[4];
-                            line1[(blockIdx_x*8 + 5)*max_channels+ch] = val[5];
-                            line1[(blockIdx_x*8 + 6)*max_channels+ch] = val[6];
-                            line1[(blockIdx_x*8 + 7)*max_channels+ch] = val[7];
+                            line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
+                            line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
+                            line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
+                            line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
                        }
-                        for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++)
-                        {
-                            half val = (bottom1 + (ch+startchannel)*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth)[blockIdx_x];
-                            line1[(blockIdx_x)*max_channels+ch] = val;
+                        for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
+                            half val =
+                                (bottom1 + (ch + startchannel) * bottomheight * bottomwidth
+                                 + (y2 + j - padding) * bottomwidth)[blockIdx_x];
+                            line1[(blockIdx_x)*max_channels + ch] = val;
                        }
                    }
 #endif
-                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
-                    {
+                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
                        if (deltachannels < subchannels)
-                            memzero(line1 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half));
+                            memzero(
+                                line1 + blockIdx_x * max_channels + deltachannels,
+                                (subchannels - deltachannels) * sizeof(half));
                    }
-                }
-                else
-                    memzero(line1, max_channels*bottomwidth*sizeof(half));
+                } else
+                    memzero(line1, max_channels * bottomwidth * sizeof(half));

-                if(j == 0 && startchannel == 0)
-                {
-                    memzero(dline, neighborhood_grid_width*topwidth*sizeof(half));
-                }
-                else
-                {
-#if defined(USE_MANUAL_DMA)
-                    dmacpyLineSrcStrideStart(top + top_channel_y*neighborhood_grid_width*topheight*topwidth + blockIdx_y*topwidth,
-                                             dline,
-                                             topwidth*neighborhood_grid_width*sizeof(half),
-                                             topwidth*sizeof(half),
-                                             topwidth*topheight*sizeof(half));
+                if (j == 0 && startchannel == 0) {
+                    memzero(dline, neighborhood_grid_width * topwidth * sizeof(half));
+                } else {
+#if defined(USE_DMA)
+                    dmacpyLineSrcStrideStart(
+                        top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
+                        dline,
+                        topwidth * neighborhood_grid_width * sizeof(half),
+                        topwidth * sizeof(half),
+                        topwidth * topheight * sizeof(half));
 #else
-                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++)
-                    {
-                        for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++)
-                        {
-                            half8 val = ((__global half8*)(top + ((top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth)))[blockIdx_x];
-                            ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] = val;
+                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
+                        for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
+                            half8 val = ((
+                                __global half8
+                                    *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
+                                [blockIdx_x];
+                            ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = val;
                        }
-                        for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++)
-                        {
-                            dline[top_channel_x*topwidth+blockIdx_x] =
-                                top[(top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth+blockIdx_x];
+                        for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
+                            dline[top_channel_x * topwidth + blockIdx_x] =
+                                top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
+                                    + blockIdx_y * topwidth + blockIdx_x];
                        }
                    }
 #endif
                }

-                if (y1+j-padding >= 0 && y1+j-padding < bottomheight && y2+j-padding >= 0 && y2+j-padding < bottomheight)
-                {
-                    crosscorrh(line0, line1, dline, topwidth, max_displacement, neighborhood_grid_radius,
-                               kernel_size, padding, bottomwidth, stride1, stride2, max_channels, subchannels);
+                if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0
+                    && y2 + j - padding < bottomheight) {
+                    crosscorrh(
+                        line0,
+                        line1,
+                        dline,
+                        topwidth,
+                        max_displacement,
+                        neighborhood_grid_radius,
+                        kernel_size,
+                        padding,
+                        bottomwidth,
+                        stride1,
+                        stride2,
+                        max_channels,
+                        subchannels);
                }

-                if (j == kernel_size-1 && endchannel == bottomchannels)
-                {
-                    half8 scale = (half8){(half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems};
-                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++)
-                    {
-                        for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++)
-                        {
-                            ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] =
-                                ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] / scale;
+                if (j == kernel_size - 1 && endchannel == bottomchannels) {
+                    half8 scale = (half8){
+                        (half)sumelems,
+                        (half)sumelems,
+                        (half)sumelems,
+                        (half)sumelems,
+                        (half)sumelems,
+                        (half)sumelems,
+                        (half)sumelems,
+                        (half)sumelems};
+                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
+                        for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
+                            ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] =
+                                ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] / scale;
                        }
-                        for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++)
-                        {
-                            dline[top_channel_x*topwidth+blockIdx_x] = dline[top_channel_x*topwidth+blockIdx_x]/(half)sumelems;
+                        for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
+                            dline[top_channel_x * topwidth + blockIdx_x] =
+                                dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems;
                        }
                    }
                }

-#if defined(USE_MANUAL_DMA)
-                dmacpyLineDstStrideStart(dline,
-                                         top + top_channel_y*neighborhood_grid_width*topheight*topwidth + blockIdx_y*topwidth,
-                                         topwidth*neighborhood_grid_width*sizeof(half),
-                                         topwidth*sizeof(half),
-                                         topwidth*topheight*sizeof(half));
+#if defined(USE_DMA)
+                dmacpyLineDstStrideStart(
+                    dline,
+                    top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
+                    topwidth * neighborhood_grid_width * sizeof(half),
+                    topwidth * sizeof(half),
+                    topwidth * topheight * sizeof(half));
 #else
-                for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++)
-                {
-                    for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++)
-                    {
-                        ((__global half8*)(top + ((top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth)))[blockIdx_x] =
-                            ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] + (half8) {0, 0, 0, 0, 0, 0, 0, 0};
+                for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
+                    for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
+                        ((__global half8
+                              *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
+                            [blockIdx_x] = ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x]
+                                           + (half8){0, 0, 0, 0, 0, 0, 0, 0};
                    }
-                    for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++)
-                    {
-                        top[(top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth+blockIdx_x]
-                            = dline[top_channel_x*topwidth+blockIdx_x] + (half)0;
+                    for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
+                        top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
+                            + blockIdx_y * topwidth + blockIdx_x] =
+                            dline[top_channel_x * topwidth + blockIdx_x] + (half)0;
                    }
                }
 #endif
--- a/inference-engine/src/vpu/custom_kernels/ctc.cl
+++ b/inference-engine/src/vpu/custom_kernels/ctc.cl
@ -3,10 +3,12 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-__global half *find(__global const half *begin, __global const half *end, half value) {
+__global half *find(__global const half *begin, __global const half *end, half value)
+{
    while (begin != end) {
-        if (*begin == value)  {
+        if (*begin == value) {
            return begin;
        }
        ++begin;
@ -14,160 +16,79 @@ __global half *find(__global const half *begin, __global const half *end, half v
    return end;
 }

-#define USE_MANUAL_DMA
-
-#ifdef USE_MANUAL_DMA
-
-__kernel void __dma_preload_CTCDecoder(__global half *probabilities,
-                                       __global half *sequence_indicators,
-                                       __global half *output_sequences,
-                                       int width,
-                                       int height,
-                                       int channels,
-                                       __local half *local_src,
-                                       __local half *local_dst)
+__kernel void CTCDecoder(
+    __global half *restrict probabilities,
+    __global half *restrict sequence_indicators,
+    __global half *restrict output,
+    int width,
+    int height,
+    int channels)
 {
-    WorkGroupDmaCreateStrideTransaction(
-        probabilities, // src
+    __local half local_src[88 * 1 * 77];
+    __local half local_dst[88 * 1];
+
+    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
-        width * sizeof(half), // src_width,
-        width * sizeof(half), // dst_width,
-        width * height * sizeof(half), // src_stride,
-        width * sizeof(half), // dst_stride,
-        width * height * channels * sizeof(half), // size
+        probabilities, // src
+        width, // num_elements_per_line,
+        height * channels, // num_lines,
+        width * (height - 1), // src_line_stride,
+        width * (height - 1), // dst_line_stride,
        0);
-}

-__kernel void __dma_postwrite_CTCDecoder(__global half *probabilities,
-                                         __global half *sequence_indicators,
-                                         __global half *output_sequences,
-                                         int width,
-                                         int height,
-                                         int channels,
-                                         __local half *local_src,
-                                         __local half *local_dst)
-{
-    WorkGroupDmaCreateStrideTransaction(
-        local_dst, // src
-        output_sequences, // dst
-        channels * sizeof(half), // src_width,
-        channels * sizeof(half), // dst_width,
-        channels * sizeof(half), // src_stride,
-        channels * sizeof(half), // dst_stride,
-        channels * height * sizeof(half), // size
-        0);
-}
+    wait_group_events(1, &e1);

-__kernel void CTCDecoder(__global half *probabilities,
-                         __global half *sequence_indicators,
-                         __global half *output_sequences,
-                         int width,
-                         int height,
-                         int channels,
-                         __local half *local_src,
-                         __local half *local_dst)
-{
-    const int T = channels;
-    const int B = height;
-    const int C = width;
+    const int T = channels; // Time
+    const int B = height; // Batches
+    const int C = width; // Chars

-    for (int i = 0; i < B*T; i++)
-    {
+    #pragma unroll 4
+    for (int i = 0; i < B * T; i++) {
        local_dst[i] = -1.h;
    }

    int output_index = 0;

-    for (int b = 0; b < B; ++b)
-    {
-        __global const half *seq_ind = sequence_indicators + b*T;
+    for (int b = 0; b < B; ++b) {
+        __global const half *restrict seq_ind = sequence_indicators + b * T;
        const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
-        const int time = min(seq_len, T);
+        const int time    = min(seq_len, T);

        int prev_class_idx = -1;

-        for (int t = 0; t < time; ++t)
-        {
-            __local const half *probs = local_src + b*C + t*C*B;
-            int max_class_idx = 0;
-            half max_prob = probs[0];
+        #pragma unroll 4
+        for (int t = 0; t < time; ++t) {
+            __local const half *restrict probs = local_src + b * C + t * C * B;

-            for (int c = 1; c < C; ++c)
-            {
+            int max_class_idx = 0;
+            half max_prob     = probs[0];
+            for (int c = 1; c < C; ++c) {
                const half prob = probs[c];
-                if (prob > max_prob)
-                {
+                if (prob > max_prob) {
                    max_class_idx = c;
-                    max_prob = prob;
+                    max_prob      = prob;
                }
            }

-            if (max_class_idx < C-1 && max_class_idx != prev_class_idx)
-            {
-                local_dst[b*T + output_index] = (half)max_class_idx;
+            if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) {
+                local_dst[b * T + output_index] = (half)max_class_idx;
                output_index++;
            }

            prev_class_idx = max_class_idx;
        }
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy_2D2D(
+        output, // dst
+        local_dst, // src
+        channels, // num_elements_per_line,
+        height, // num_lines,
+        0, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+
+    wait_group_events(1, &e2);
 }
-
-#else
-
-__kernel void CTCDecoder(__global half *probabilities,
-                         __global half *sequence_indicators,
-                         __global half *output_sequences,
-                         int width,
-                         int height,
-                         int channels,
-                         __local half *local_src,
-                         __local half *local_dst)
-{
-    const int T = channels;
-    const int B = height;
-    const int C = width;
-
-    for (int i = 0; i < B*T; i++)
-    {
-        output_sequences[i] = -1.h;
-    }
-
-    int output_index = 0;
-
-    for (int b = 0; b < B; ++b)
-    {
-        __global const half *seq_ind = sequence_indicators + b*T;
-        const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
-        const int time = min(seq_len, T);
-
-        int prev_class_idx = -1;
-
-        for (int t = 0; t < time; ++t)
-        {
-            __global const half *probs = probabilities + b*C + t*C*B;
-            int max_class_idx = 0;
-            half max_prob = probs[0];
-
-            for (int c = 1; c < C; ++c)
-            {
-                const half prob = probs[c];
-                if (prob > max_prob)
-                {
-                    max_class_idx = c;
-                    max_prob = prob;
-                }
-            }
-
-            if (max_class_idx < C-1 && max_class_idx != prev_class_idx)
-            {
-                output_sequences[b*T + output_index] = (half)max_class_idx;
-                output_index++;
-            }
-
-            prev_class_idx = max_class_idx;
-        }
-    }
-}
-
-#endif
--- a/inference-engine/src/vpu/custom_kernels/customLayerBindings.xml
+++ b/inference-engine/src/vpu/custom_kernels/customLayerBindings.xml
@ -1,6 +1,6 @@
 <CustomLayer name="ReorgYolo" type="MVCL" version="1">
    <Kernel entry="reorg_hwc_naive">
-        <Source filename="reorg_hwc.bin"/>
+        <Source filename="reorg_hwc_naive.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
@ -8,15 +8,12 @@
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Scalar arg-name="stride" type="int" source="stride"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="0"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="0"/>
        </Parameters>
        <WorkSizes dim="input,0" global="F,1,1" local="stride*stride,1,1"/>
    </Kernel>
 </CustomLayer>

 <CustomLayer name="ReorgYolo" type="MVCL" version="1">
-    <Where stride="2"/>
    <Kernel entry="reorg_chw">
        <Source filename="reorg_chw.bin"/>
        <Parameters>
@ -26,22 +23,18 @@
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Scalar arg-name="stride" type="int" source="stride"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*2*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*2*2"/>
        </Parameters>
-        <WorkSizes dim="input,0" global="Y*F/(stride*stride),stride*stride,1" local="1,stride,1"/>
+        <WorkSizes dim="input,0" global="Y*F/(stride*stride),stride*stride,1" local="stride,stride,1"/>
    </Kernel>
 </CustomLayer>

 <CustomLayer name="RegionYolo" type="MVCL" version="1">
    <Where do_softmax="1"/>
    <Kernel entry="region_chw">
-        <Source filename="region.bin"/>
+        <Source filename="region_chw.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
@ -50,82 +43,74 @@
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
-        <WorkSizes global="((X+7)/8)*8*Y,num,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
+        <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
 </CustomLayer>

-<CustomLayer name="RegionYolo" type="MVCL" version="1">
-    <Where do_softmax="0" mask="0,1,2"/>
-    <Kernel entry="region_chw">
-        <Source filename="region.bin"/>
-        <Parameters>
-            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
-            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
-            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
-            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
-            <Scalar arg-name="classes" type="int" source="classes"/>
-            <Scalar arg-name="coords" type="int" source="coords"/>
-            <Scalar arg-name="num" type="int" source="num"/>
-            <Scalar arg-name="maskSize" type="int" source="3"/>
-            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
-        </Parameters>
-        <WorkSizes global="((X+7)/8)*8*Y,3,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
-    </Kernel>
+<CustomLayer name="RegionYolo" type="MVCL" version="1">-->
+   <Where do_softmax="0" mask="0,1,2"/>
+   <Kernel entry="region_chw">
+       <Source filename="region_chw.bin"/>
+       <Parameters>
+           <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
+           <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
+           <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
+           <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
+           <Scalar arg-name="classes" type="int" source="classes"/>
+           <Scalar arg-name="coords" type="int" source="coords"/>
+           <Scalar arg-name="num" type="int" source="num"/>
+           <Scalar arg-name="maskSize" type="int" source="3"/>
+           <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
+       </Parameters>
+       <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
+   </Kernel>
 </CustomLayer>

 <CustomLayer name="RegionYolo" type="MVCL" version="1">
-    <Where do_softmax="1"/>
-    <Kernel entry="region_hwc">
-        <Source filename="region.bin"/>
-        <Parameters>
-            <Tensor arg-name="src_data" type="input" port-index="0" format="BYXF"/>
-            <Tensor arg-name="dst_data" type="output" port-index="0" format="BYXF"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
-            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
-            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
-            <Scalar arg-name="classes" type="int" source="classes"/>
-            <Scalar arg-name="coords" type="int" source="coords"/>
-            <Scalar arg-name="num" type="int" source="num"/>
-            <Scalar arg-name="maskSize" type="int" source="3"/>
-            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
-        </Parameters>
-        <WorkSizes global="((X+7)/8)*8*Y,num,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
-    </Kernel>
+   <Where do_softmax="1"/>
+   <Kernel entry="region_hwc">
+       <Source filename="region_hwc.bin"/>
+       <Parameters>
+           <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
+           <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
+           <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
+           <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
+           <Scalar arg-name="classes" type="int" source="classes"/>
+           <Scalar arg-name="coords" type="int" source="coords"/>
+           <Scalar arg-name="num" type="int" source="num"/>
+           <Scalar arg-name="maskSize" type="int" source="3"/>
+           <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
+       </Parameters>
+       <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
+   </Kernel>
 </CustomLayer>

 <CustomLayer name="RegionYolo" type="MVCL" version="1">
-    <Where do_softmax="0" mask="0,1,2"/>
-    <Kernel entry="region_hwc">
-        <Source filename="region.bin"/>
-        <Parameters>
-            <Tensor arg-name="src_data" type="input" port-index="0" format="BYXF"/>
-            <Tensor arg-name="dst_data" type="output" port-index="0" format="BYXF"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
-            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
-            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
-            <Scalar arg-name="classes" type="int" source="classes"/>
-            <Scalar arg-name="coords" type="int" source="coords"/>
-            <Scalar arg-name="num" type="int" source="num"/>
-            <Scalar arg-name="maskSize" type="int" source="3"/>
-            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
-        </Parameters>
-        <WorkSizes global="((X+7)/8)*8*Y,3,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
-    </Kernel>
+   <Where do_softmax="0" mask="0,1,2"/>
+   <Kernel entry="region_hwc">
+       <Source filename="region_hwc.bin"/>
+       <Parameters>
+           <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
+           <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
+           <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
+           <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
+           <Scalar arg-name="classes" type="int" source="classes"/>
+           <Scalar arg-name="coords" type="int" source="coords"/>
+           <Scalar arg-name="num" type="int" source="num"/>
+           <Scalar arg-name="maskSize" type="int" source="3"/>
+           <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
+       </Parameters>
+       <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
+   </Kernel>
 </CustomLayer>

 <!-- Pixel-wise kernel binding, local work group config is per line in the input tensor  -->
 <CustomLayer name="GRN" type="MVCL" version="1">
-    <Kernel entry="grn_NCHW">
+    <Kernel entry="grn">
        <Source filename="grn.bin"/>
        <Parameters>
-            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
-            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*F*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*F*2"/>
+            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
+            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Scalar arg-name="bias" type="float" source="bias"/>
        </Parameters>
@ -136,7 +121,7 @@
 <!-- Two stage layer binding, first kernel computes mean and variance, the second one normalizes input tensor-->
 <CustomLayer name="MVN" type="MVCL" version="1">
    <Kernel entry="reduction_mean" stage="0">
-        <Source filename="mvn.bin"/>
+        <Source filename="mvn_reduction.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="mean" type="output_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
@ -144,12 +129,11 @@
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="across_channels" type="int" source="across_channels"/>
-            <Data arg-name="src_line" type="local_data" dim="input,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
    </Kernel>
    <Kernel entry="mvn_scale" stage="1">
-        <Source filename="mvn.bin"/>
+        <Source filename="mvn_scale.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
@ -160,8 +144,6 @@
            <Scalar arg-name="across_channels" type="int" source="across_channels"/>
            <Scalar arg-name="normalize_variance" type="int" source="normalize_variance"/>
            <Scalar arg-name="nparts" type="int" port-index="0" source="I.Y"/>
-            <Data arg-name="src_line" type="local_data" dim="input,0" size="X*2"/>
-            <Data arg-name="dst_line" type="local_data" dim="input,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
    </Kernel>
@ -174,12 +156,10 @@
        <Parameters>
            <Tensor arg-name="probabilities" type="input" port-index="0" format="FYX"/>
            <Tensor arg-name="sequence_indicators" type="input" port-index="1" format="BF"/>
-            <Tensor arg-name="output_sequences" type="output" port-index="0" format="BFYX"/>
+            <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="width" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="height" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="F*Y*X*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="F*Y*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="1,1,1" local="1,1,1"/>
    </Kernel>
@ -204,64 +184,36 @@

 <!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
 <CustomLayer name="FakeQuantize" type="MVCL" version="1">
-    <!-- <Where levels="2"/> -->
    <Kernel entry="quantize">
-        <Source filename="quantize.bin"/>
+        <Source filename="fakequantize.bin"/>
        <Parameters>
-            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
+            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="input_low" type="input" port-index="1" format="ANY"/>
            <Tensor arg-name="input_high" type="input" port-index="2" format="ANY"/>
            <Tensor arg-name="output_low" type="input" port-index="3" format="ANY"/>
            <Tensor arg-name="output_high" type="input" port-index="4" format="ANY"/>
-            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
+            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="levels" type="int" source="levels"/>
            <Scalar arg-name="input_low_size" type="int" port-index="1" source="I.F"/>
            <Scalar arg-name="input_high_size" type="int" port-index="2" source="I.F"/>
            <Scalar arg-name="output_low_size" type="int" port-index="3" source="I.F"/>
            <Scalar arg-name="output_high_size" type="int" port-index="4" source="I.F"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
-            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*F*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*F*2"/>
+            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
        </Parameters>
-        <WorkSizes dim="input,0" global="1,Y,1" local="1,1,1"/>
+        <WorkSizes dim="input,0" global="1,Y,F" local="1,Y,1"/>
    </Kernel>
 </CustomLayer>

 <!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
-<CustomLayer name="QuantizeTemporaryType" type="MVCL" version="1">
-    <Where levels="256"/>
-    <Kernel entry="quantize">
-        <Source filename="binary_layers.bin"/>
-        <Parameters>
-            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
-            <Tensor arg-name="input_low" type="input" port-index="1" format="BFYX"/>
-            <Tensor arg-name="input_high" type="input" port-index="2" format="BFYX"/>
-            <Tensor arg-name="output_low" type="input" port-index="3" format="BFYX"/>
-            <Tensor arg-name="output_high" type="input" port-index="4" format="BFYX"/>
-            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
-            <Scalar arg-name="levels" type="int" source="levels"/>
-            <Scalar arg-name="input_low_size" type="int" source="input_low_size"/>
-            <Scalar arg-name="input_high_size" type="int" source="input_high_size"/>
-            <Scalar arg-name="output_low_size" type="int" source="output_low_size"/>
-            <Scalar arg-name="output_high_size" type="int" source="output_high_size"/>
-            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
-            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
-            <Data arg-name="src_local" type="local_data" dim="input,0" size="X*Y*2"/>
-            <Data arg-name="dst_local" type="local_data" dim="input,0" size="X*Y*2"/>
-        </Parameters>
-        <WorkSizes dim="input,0" global="1,1,F" local="1,1,1"/>
-    </Kernel>
-</CustomLayer>
-
-<CustomLayer name="QuantizeTemporaryType" type="MVCL" version="1">
+<CustomLayer name="FakeQuantizeBin" type="MVCL" version="1">
    <Where levels="2"/>
    <Kernel entry="binarization">
-        <Source filename="binary_layers.bin"/>
+        <Source filename="binarization.bin"/>
        <Parameters>
-            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
+            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="input_low_high" type="input" port-index="1" format="BFYX"/>
-            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
+            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="switch_out" type="int" source="switch_out"/>
            <Scalar arg-name="input_low_high_size" type="int" source="input_low_size"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
@ -301,9 +253,6 @@
            <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
            <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
            <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
-
-            <Data arg-name="src_local" type="local_data" dim="input,0" size="X*F*3*2"/>
-            <Data arg-name="dst_local" type="local_data" dim="output,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
    </Kernel>
@ -331,9 +280,6 @@
            <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
            <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
            <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
-
-            <Data arg-name="src_local" type="local_data" dim="input,0" size="X*F*2"/>
-            <Data arg-name="dst_local" type="local_data" dim="output,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
    </Kernel>
@ -343,7 +289,7 @@
 <!-- An example of a kernel binding that uses data blob from IR -->
 <CustomLayer name="BinaryConvolution" type="MVCL" version="1">
    <Kernel entry="binary_convolution">
-        <Source filename="binary_layers.bin"/>
+        <Source filename="binary_convolution.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
@ -369,12 +315,10 @@
 <CustomLayer name="Resample" type="MVCL" version="1">
    <Where antialias="0"/>
    <Kernel entry="resample_nearest">
-        <Source filename="resample_nn.bin"/>
+        <Source filename="resample_noAA.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*ceil(1/factor)*F*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="output,0" size="X*F*2"/>
            <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="factor" type="float" source="factor"/>
@ -389,12 +333,10 @@
 <CustomLayer name="Resample" type="MVCL" version="1">
    <Where antialias="1"/>
    <Kernel entry="resample_with_antialias">
-        <Source filename="resample_with_antialias.bin"/>
+        <Source filename="resample_AA.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*5*F*2"/>
-            <Data arg-name="local_dst" type="local_data" dim="output,0" size="X*F*2"/>
            <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="factor" type="float" source="factor"/>
@ -409,7 +351,7 @@
 <CustomLayer name="Convolution" type="MVCL" version="1">
    <Where kernel="1,1" dilation="1,1"/>
    <Kernel entry="Convolution1x1_NCHW">
-       <Source filename="convolution1x1.bin"/>
+       <Source filename="convolution1x1_chw.bin"/>
       <Parameters>
           <Tensor arg-name="in" type="input" port-index="0" format="BFYX"/>
           <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
@ -429,10 +371,6 @@
           <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
           <Scalar arg-name="output" type="int" port-index="0" source="output"/>
           <Scalar arg-name="group" type="int" port-index="0" source="group"/>
-
-           <Data arg-name="in_local" type="local_data" dim="input,0" size="X*F*2"/>
-           <Data arg-name="out_local" type="local_data" dim="output,0" size="X*2"/>
-
       </Parameters>
       <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
   </Kernel>
@ -441,7 +379,7 @@
 <CustomLayer name="Convolution" type="MVCL" version="1">
    <Where kernel="1,1" dilation="1,1"/>
    <Kernel entry="Convolution1x1_NHWC">
-       <Source filename="convolution1x1.bin"/>
+       <Source filename="convolution1x1_hwc.bin"/>
       <Parameters>
           <Tensor arg-name="in" type="input" port-index="0" format="BYXF"/>
           <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
@ -461,9 +399,6 @@
           <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
           <Scalar arg-name="output" type="int" port-index="0" source="output"/>
           <Scalar arg-name="group" type="int" port-index="0" source="group"/>
-
-           <Data arg-name="in_local" type="local_data" dim="input,0" size="X*F*2"/>
-           <Data arg-name="out_local" type="local_data" dim="output,0" size="X*2"/>
       </Parameters>
       <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
   </Kernel>
@ -509,8 +444,6 @@
            <Tensor arg-name="input_feature_map" type="input" port-index="1" format="BFYX"/>
            <Tensor arg-name="input_rois" type="input" port-index="2" format="BFYX"/>
            <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
-            <Data arg-name="local_input_priors" type="local_data" dim="input,1" size="X*2"/>
-            <Data arg-name="local_output" type="local_data" dim="input,1" size="((X+7)/8)*12*2"/>
            <Scalar arg-name="grid_h" type="int" port-index="1" source="I.Y"/>
            <Scalar arg-name="grid_w" type="int" port-index="1" source="I.X"/>
            <Scalar arg-name="stride_h" type="float" source="stride_h"/>
@ -530,8 +463,6 @@
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="scale" type="float" source="scale"/>
            <Scalar arg-name="bias" type="float" source="bias"/>
-            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*1"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="input,0" global="X,Y,F" local="X,1,1"/>
    </Kernel>
@ -570,7 +501,6 @@
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
-            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*F*2"/>
        </Parameters>
        <WorkSizes dim="input,0" global="(X+511)/512,Y,1" local="1,1,1"/>
    </Kernel>
--- a/inference-engine/src/vpu/custom_kernels/cvtu8f16.cl
+++ b/inference-engine/src/vpu/custom_kernels/cvtu8f16.cl
@ -3,88 +3,46 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-#define USE_MANUAL_DMA 1
-
-#if defined (USE_MANUAL_DMA)
-
-__kernel void __dma_preload_cvtu8f16(
-    __global uchar* restrict src,
-    __global half*  restrict dst,
-    float   scale,
-    float   bias,
-    __local uchar* restrict local_src,
-    __local half*  restrict local_dst)
+__kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias)
 {
-    WorkGroupDmaCreate3DTransaction(
-        src + get_group_id(0)*get_local_size(0)
-            + get_group_id(1)*get_local_size(1)*get_global_size(0)
-            + get_group_id(2)*get_local_size(2)*get_global_size(0)*get_global_size(1), // src
+    __local uchar local_src[8 * 1024];
+    __local half local_dst[8 * 1024];
+
+    event_t e1 = async_work_group_copy_3D3D(
        local_src, // dst
-        get_local_size(0) * sizeof(uchar), // src width
-        get_local_size(0) * sizeof(uchar), // dst width
-        get_global_size(0) * sizeof(uchar), // src stride
-        get_local_size(0) * sizeof(uchar),  // dst stride
+        src + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
+            + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // src
+        get_local_size(0), // num_elements_per_line
+        get_local_size(0) * get_local_size(1) / (get_local_size(0)), // num_lines
+        get_global_size(0) - get_local_size(0), // src_line_stride
+        0, // dst_line_stride
        get_local_size(2), // num planes
-        get_global_size(0) * get_global_size(1) * sizeof(uchar), // src plane stride
-        get_local_size(0) * get_local_size(1) * sizeof(uchar), // dst plane stride
-        get_local_size(0) * get_local_size(1) * sizeof(uchar), // plane size
+        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src plane stride
+        0, // dst plane stride
        0);
-}
+    wait_group_events(1, &e1);

-__kernel void __dma_postwrite_cvtu8f16(
-    __global uchar* restrict src,
-    __global half*  restrict dst,
-    float   scale,
-    float   bias,
-    __local uchar* restrict local_src,
-    __local half*  restrict local_dst)
-{
-    WorkGroupDmaCreate3DTransaction(
+    size_t idx = get_local_id(0)
+               + get_local_id(1) * get_local_size(0)
+               + get_local_id(2) * get_local_size(0) * get_local_size(1);
+
+    local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy_3D3D(
+        dst + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
+            + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // dst
        local_dst, // src
-        dst + get_group_id(0)*get_local_size(0)
-            + get_group_id(1)*get_local_size(1)*get_global_size(0)
-            + get_group_id(2)*get_local_size(2)*get_global_size(0)*get_global_size(1), // dst
-        get_local_size(0) * sizeof(half), // src width
-        get_local_size(0) * sizeof(half), // dst width
-        get_local_size(0) * sizeof(half), // src stride
-        get_global_size(0) * sizeof(half), // dst stride
-        get_local_size(2), // num planes
-        get_local_size(0) * get_local_size(1) * sizeof(half), // src plane stride
-        get_global_size(0) * get_global_size(1) * sizeof(half), // dst plane stride
-        get_local_size(0) * get_local_size(1) * sizeof(half), // plane size
+        get_local_size(0), // num_elements_per_line
+        get_local_size(1), // num_lines
+        0, // src_line_stride
+        get_global_size(0) - get_local_size(0), // dst_line_stride
+        get_local_size(2), // num_planes
+        0, // src_plane_stride
+        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
        0);
+    wait_group_events(1, &e2);
 }
-
-__kernel void cvtu8f16(
-    __global uchar* restrict src,
-    __global half*  restrict dst,
-    float   scale,
-    float   bias,
-    __local uchar* restrict local_src,
-    __local half* restrict local_dst)
-{
-    size_t idx = get_local_id(0) +
-                 get_local_id(1)*get_local_size(0) +
-                 get_local_id(2)*get_local_size(0)*get_local_size(1);
-    local_dst[idx] = convert_half(local_src[idx])*(half)scale+(half)bias;
-}
-
-#else // defined (USE_MANUAL_DMA)
-
-__kernel void cvtu8f16(
-    __global uchar* restrict src,
-    __global half*  restrict dst,
-    float   scale,
-    float   bias,
-    __local uchar* restrict local_src, // unused, added for compatibility with DMA variant
-    __local half* restrict local_dst) // unused, added for compatibility with DMA variant
-{
-    int idx = get_global_id(0) +
-              get_global_id(1) * get_global_size(0) +
-              get_global_id(2) * get_global_size(0) * get_global_size(1);
-    dst[idx] = convert_half(src[idx])*(half)scale+(half)bias;
-}
-
-#endif // defined (USE_MANUAL_DMA)
-
--- a/inference-engine/src/vpu/custom_kernels/detectron_prior_grid_gen.cl
+++ b/inference-engine/src/vpu/custom_kernels/detectron_prior_grid_gen.cl
@ -3,102 +3,63 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-__kernel void __dma_preload_experimental_detectron_prior_grid_generator(
-    __global const half* restrict input_priors,
-    __global const half* restrict input_feature_map,
-    __global const half* restrict input_rois,
-    __global half* restrict output,
-    __local half* restrict local_input_priors,
-    __local half* restrict local_output,
-    int grid_h,
-    int grid_w,
-    float stride_h,
-    float stride_w,
-    int num_priors,
-    int num_anchors_per_prior) {
-
-    // Move input_priors to local memory.
-    WorkGroupDmaCreateStrideTransaction(
-        input_priors, // src
-        local_input_priors, // dst
-        num_anchors_per_prior * num_priors * sizeof(half), // src_width
-        num_anchors_per_prior * num_priors * sizeof(half), // dst_width
-        num_anchors_per_prior * num_priors * sizeof(half), // src_stride
-        num_anchors_per_prior * num_priors * sizeof(half), // dst_stride
-        num_anchors_per_prior * num_priors * sizeof(half), // total_size
-        0);
-}
-
-__kernel void __dma_postwrite_experimental_detectron_prior_grid_generator(
-    __global const half* restrict input_priors,
-    __global const half* restrict input_feature_map,
-    __global const half* restrict input_rois,
-    __global half* restrict output,
-    __local half* restrict local_input_priors,
-    __local half* restrict local_output,
-    int grid_h,
-    int grid_w,
-    float stride_h,
-    float stride_w,
-    int num_priors,
-    int num_anchors_per_prior) {
-
-    int local_width = get_local_size(0);
-    int width_start = get_group_id(0) * get_local_size(0);
-    int width_end = min(width_start + local_width, grid_w);
-    int width = width_end - width_start;
-
-    WorkGroupDmaCreateStrideTransaction(
-        local_output,                                               // src
-        output + get_group_id(0) * get_local_size(0) *
-                 num_anchors_per_prior * num_priors
-               + get_group_id(1) * get_local_size(1) * grid_w *
-                 num_anchors_per_prior * num_priors,                // dst
-        width * num_anchors_per_prior * num_priors * sizeof(half),  // src_width
-        width * num_anchors_per_prior * num_priors * sizeof(half),  // dst_width
-        grid_w * num_anchors_per_prior * num_priors * sizeof(half), // src_stride
-        grid_w * num_anchors_per_prior * num_priors * sizeof(half), // dst_stride
-        width * num_anchors_per_prior * num_priors * sizeof(half),  // total_size
-        0);
-}
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

 __kernel void experimental_detectron_prior_grid_generator(
-    __global const half* restrict input_priors,
-    __global const half* restrict input_feature_map,
-    __global const half* restrict input_rois,
-    __global half* restrict output,
-    __local half* restrict local_input_priors,
-    __local half* restrict local_output,
+    __global const half *restrict input_priors,
+    __global const half *restrict input_feature_map,
+    __global const half *restrict input_rois,
+    __global half *restrict output,
    int grid_h,
    int grid_w,
    float stride_h,
    float stride_w,
    int num_priors,
-    int num_anchors_per_prior) {
+    int num_anchors_per_prior)
+{
+    __local half local_input_priors[8 * 1024];
+    __local half local_output[8 * 1024];

-    int workgroup_width = get_local_size(0);
-    int width_start = get_group_id(0) * workgroup_width;
-    int width_end = min(width_start + workgroup_width, grid_w);
-    int width = width_end - width_start;
+    event_t e1 = async_work_group_copy(
+        local_input_priors,
+        input_priors,
+        num_anchors_per_prior * num_priors,
+        0);
+    wait_group_events(1, &e1);

-    int h = get_group_id(1);
-    int w_idx = get_group_id(0) * workgroup_width;
+    int width_start = get_group_id(0) * get_local_size(0);
+    int width_end   = min(width_start + get_local_size(0), (unsigned)grid_w);
+    int width       = width_end - width_start;
+
+    int h     = get_group_id(1);
+    int w_idx = get_group_id(0) * get_local_size(0);
    for (int w = 0; w < width; ++w) {
        #pragma unroll 4
        for (int p = 0; p < num_priors; ++p) {
            local_output[(w * num_priors + p) * num_anchors_per_prior + 0] =
-                local_input_priors[4 * p + 0] +
-                convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
+                local_input_priors[4 * p + 0]
+                + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 1] =
-                local_input_priors[4 * p + 1] +
-                convert_half(stride_h) * (convert_half(h) + 0.5);
+                local_input_priors[4 * p + 1] + convert_half(stride_h) * (convert_half(h) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 2] =
-                local_input_priors[4 * p + 2] +
-                convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
+                local_input_priors[4 * p + 2]
+                + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 3] =
-                local_input_priors[4 * p + 3] +
-                convert_half(stride_h) * (convert_half(h) + 0.5);
+                local_input_priors[4 * p + 3] + convert_half(stride_h) * (convert_half(h) + 0.5);
        }
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy_2D2D(
+        output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
+            + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior
+                  * num_priors, // dst
+        local_output, // src
+        width * num_anchors_per_prior * num_priors, // num_elements_per_line
+        1, // num_lines
+        (grid_w - width) * num_anchors_per_prior * num_priors, // src_line_stride
+        (grid_w - width) * num_anchors_per_prior * num_priors, // dst_line_stride
+        0);
+    wait_group_events(1, &e2);
 }
--- a/inference-engine/src/vpu/custom_kernels/fakequantize.cl
+++ b/inference-engine/src/vpu/custom_kernels/fakequantize.cl
@ -0,0 +1,111 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+__kernel void quantize(
+    __global const half *restrict src_data,
+    __global const half *restrict input_low,
+    __global const half *restrict input_high,
+    __global const half *restrict output_low,
+    __global const half *restrict output_high,
+    __global half *restrict dst_data,
+    int levels,
+    int input_low_size,
+    int input_high_size,
+    int output_low_size,
+    int output_high_size,
+    int W,
+    int H)
+{
+    __local half local_src[15 * 1024];
+    __local half local_dst[15 * 1024];
+
+    event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
+    wait_group_events(1, &e1);
+
+    int c = get_group_id(2);
+
+    half h_ilow  = (input_low_size == 1 ? input_low[0] : input_low[c]);
+    half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]);
+    half h_olow  = (output_low_size == 1 ? output_low[0] : output_low[c]);
+    half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);
+
+    half const1 = (half)(
+        !(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
+    half const2 =
+        (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));
+
+    __local const half *restrict src = local_src + W * get_local_id(1);
+    __local half       *restrict dst = local_dst + W * get_local_id(1);
+
+    for (int w = 0; w < W / 8; w++) {
+        half8 val = *((__local half8 *)src + w);
+        half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;
+
+        aux = (half8){
+            (half)(short)(aux.s0),
+            (half)(short)(aux.s1),
+            (half)(short)(aux.s2),
+            (half)(short)(aux.s3),
+            (half)(short)(aux.s4),
+            (half)(short)(aux.s5),
+            (half)(short)(aux.s6),
+            (half)(short)(aux.s7)};
+
+        aux = aux * (half8)const2 + (half8)h_olow;
+
+        short8 a;
+        short8 b;
+        a.s0 = (val.s0 <= h_ilow);
+        a.s1 = (val.s1 <= h_ilow);
+        a.s2 = (val.s2 <= h_ilow);
+        a.s3 = (val.s3 <= h_ilow);
+        a.s4 = (val.s4 <= h_ilow);
+        a.s5 = (val.s5 <= h_ilow);
+        a.s6 = (val.s6 <= h_ilow);
+        a.s7 = (val.s7 <= h_ilow);
+
+        b.s0 = (val.s0 > h_ihigh);
+        b.s1 = (val.s1 > h_ihigh);
+        b.s2 = (val.s2 > h_ihigh);
+        b.s3 = (val.s3 > h_ihigh);
+        b.s4 = (val.s4 > h_ihigh);
+        b.s5 = (val.s5 > h_ihigh);
+        b.s6 = (val.s6 > h_ihigh);
+        b.s7 = (val.s7 > h_ihigh);
+
+        a = ~(a - (short8)1);
+        b = ~(b - (short8)1);
+
+        short8 c1 = (~a & b);
+        short8 c2 = (~a & ~b);
+
+        short8 res = (a & as_short8((half8)h_olow)) | (c1 & as_short8((half8)h_ohigh)) | (c2 & as_short8(aux));
+
+        *((__local half8 *)dst + w) = as_half8(res);
+    }
+
+    for (int w = W & (~0x7); w < W; w++) {
+        half val = src[w];
+        short a  = val <= h_ilow;
+        a        = ~(a - 1);
+        short b  = val > h_ihigh;
+        b        = ~(b - 1);
+
+        short c1 = (~a & b);
+        short c2 = (~a & ~b);
+
+        short res = (a & as_short(h_olow)) | (c1 & as_short(h_ohigh))
+                    | (c2 & as_short(((half)(round((val - h_ilow) * const1) * const2) + h_olow)));
+
+        dst[w] = as_half(res);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/grn.cl
+++ b/inference-engine/src/vpu/custom_kernels/grn.cl
@ -3,111 +3,61 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-#define USE_MANUAL_DMA 1
-
-#if defined (USE_MANUAL_DMA)
-
-__kernel void __dma_preload_grn_NCHW(
-    __global const half* restrict src,
-    __global       half* restrict dst,
-    __local        half* restrict local_src,
-    __local        half* restrict local_dst,
-    int C,
-    float bias)
+__kernel void grn(__global const half *restrict src_data, __global half *restrict dst_data, int C, float bias)
 {
-    WorkGroupDmaCreate3DTransaction(
-        src + get_group_id(0)*get_local_size(0)
-            + get_group_id(1)*get_local_size(1)*get_global_size(0), // src
-        local_src, // dst
-        get_local_size(0) * sizeof(half), // src width
-        get_local_size(0) * sizeof(half), // dst width
-        get_global_size(0) * sizeof(half), // src stride
-        get_local_size(0) * sizeof(half), // dst stride
-        C, // num planes
-        get_global_size(0) * get_global_size(1) * sizeof(half), // src plane stride
-        get_local_size(0) * get_local_size(1) * sizeof(half), // dst plane stride
-        get_local_size(0) * get_local_size(1) * sizeof(half), // plane size
+    __local half src[8 * 1024];
+    __local half dst[8 * 1024];
+
+    const size_t index = get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0);
+
+    event_t e1 = async_work_group_copy_3D3D(
+        src, // dst
+        src_data + index, // src
+        get_local_size(0), // num_elements_per_line,
+        get_local_size(1), // num_lines,
+        get_global_size(0) - get_local_size(0), // src_line_stride,
+        0, // dst_line_stride,
+        C, // num_planes,
+        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src_plane_stride
+        0, // dst_plane_stride
        0);
-}
+    wait_group_events(1, &e1);

-__kernel void __dma_postwrite_grn_NCHW(
-    __global const half* restrict src,
-    __global       half* restrict dst,
-    __local  const half* restrict local_src,
-    __local        half* restrict local_dst,
-    int C,
-    float bias)
-{
-    WorkGroupDmaCreate3DTransaction(
-        local_dst, // src
-        dst + get_group_id(0)*get_local_size(0)
-            + get_group_id(1)*get_local_size(1)*get_global_size(0), // dst
-        get_local_size(0) * sizeof(half), // src width
-        get_local_size(0) * sizeof(half), // dst width
-        get_local_size(0) * sizeof(half), // src stride
-        get_global_size(0) * sizeof(half), // dst stride
-        C, // num planes
-        get_local_size(0) * get_local_size(1) * sizeof(half), // src plane stride
-        get_global_size(0) * get_global_size(1) * sizeof(half), // dst plane stride
-        get_local_size(0) * get_local_size(1) * sizeof(half), // plane size
-        0);
-}
-
-__kernel void grn_NCHW(
-    __global const half* restrict src,
-    __global       half* restrict dst,
-    __local        half* restrict local_src,
-    __local        half* restrict local_dst,
-    int C,
-    float bias)
-{
    float variance = bias + 1e-9f;

    #pragma unroll 8
-    for (int c = 0; c < C; c++)
-    {
-        float val = (float) local_src[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)];
-        variance += val*val;
+    for (int c = 0; c < C; c++) {
+        float val = (float)src[c * get_local_size(1) * get_local_size(0)
+                               + get_local_id(1) * get_local_size(0)
+                               + get_local_id(0)];
+        variance += val * val;
    }

-    half hvariance = (half)(native_rsqrt((half)(variance/16.f))*0.25f);
+    half hvariance = (half)(native_rsqrt((half)(variance / 16.f)) * 0.25f);

    #pragma unroll 8
-    for (int c = 0; c < C; c++)
-    {
-        local_dst[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)]
-        = local_src[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)] * hvariance;
+    for (int c = 0; c < C; c++) {
+        dst[c * get_local_size(1) * get_local_size(0)
+            + get_local_id(1) * get_local_size(0)
+            + get_local_id(0)] =
+            src[c * get_local_size(1) * get_local_size(0)
+                  + get_local_id(1) * get_local_size(0) + get_local_id(0)] * hvariance;
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy_3D3D(
+        dst_data + index, // src
+        dst, // dst
+        get_local_size(0), // num_elements_per_line,
+        get_local_size(1), // num_lines,
+        0, // src_line_stride,
+        get_global_size(0) - get_local_size(0), // dst_line_stride,
+        C, // num_planes,
+        0, // src_plane_stride
+        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
+        0);
+    wait_group_events(1, &e2);
 }
-
-#else // defined (USE_MANUAL_DMA)
-
-__kernel void grn_NCHW(
-    __global const half* restrict src,
-    __global       half* restrict dst,
-    __local        half* restrict local_src, // unused, added for compatibility with DMA variant
-    __local        half* restrict local_dst, // unused, added for compatibility with DMA variant
-    int C,
-    float bias)
-{
-    float variance = bias + 1e-9f;
-
-    #pragma unroll 4
-    for (int c = 0; c < C; c++)
-    {
-        float val = (float) src[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)];
-        variance += val*val;
-    }
-
-    half hvariance = (half)(native_rsqrt((half)(variance/16.f))*0.25f);
-
-    #pragma unroll 4
-    for (int c = 0; c < C; c++)
-    {
-        dst[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)]
-        = src[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)] * hvariance;
-    }
-}
-
-#endif // defined (USE_MANUAL_DMA)
--- a/inference-engine/src/vpu/custom_kernels/mvn.cl
+++ b/inference-engine/src/vpu/custom_kernels/mvn.cl
@ -1,390 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// Define if runtime supports it. MX runtime is compatible, KMB is in WIP state
-#define USE_MANUAL_DMA 1
-
-// Set to 1 if only output is zerroed before kernel execution
-#define USE_ATOMICS 0
-
-void atomic_add_global(volatile __global float *source, const float operand) {
-    union {
-        unsigned int intVal;
-        float floatVal;
-    } newVal;
-    union {
-        unsigned int intVal;
-        float floatVal;
-    } prevVal;
-
-    do {
-        prevVal.floatVal = *source;
-        newVal.floatVal = prevVal.floatVal + operand;
-    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
-}
-
-#if defined (USE_MANUAL_DMA)
-
-__kernel void __dma_preload_reduction_mean(const __global half*  restrict src,
-                                                 __global float* restrict mean,
-                                                 __global float* restrict variance,
-                                           int W,
-                                           int H,
-                                           int across_channels,
-                                           __local half* restrict src_line)
-{
-    WorkGroupDmaCreateStrideTransaction(
-        src + get_group_id(1)*get_local_size(1)*W +
-              get_group_id(2)*get_local_size(2)*W*get_global_size(1), // src
-        src_line,                                                     // dst
-        W*get_local_size(1) * sizeof(half), // src width
-        W*get_local_size(1) * sizeof(half), // dst width
-        W*get_global_size(1) * sizeof(half), // src stride
-        W*get_local_size(1) * sizeof(half),  // dst stride
-        W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size
-        0
-        );
-}
-
-__kernel void reduction_mean(const __global half*  restrict src,
-                                   __global float* restrict mean,
-                                   __global float* restrict variance,
-                            int W,
-                            int H,
-                            int across_channels,
-                            __local half* restrict src_line)
-{
-    int h = get_global_id(1);
-    int c = get_global_id(2);
-
-    const int MAX_LOCAL_SIZE = 8;
-
-    __local float mbuf[MAX_LOCAL_SIZE];
-    __local float vbuf[MAX_LOCAL_SIZE];
-
-    mbuf[get_local_id(1)] = 0;
-    vbuf[get_local_id(1)] = 0;
-
-    if (h < H)
-    {
-        float sum  = 0.f;
-        float sum2 = 0.f;
-
-        float8 sum4  = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-        float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-
-        const __local half8* lsrc = ((const __local half8*)(src_line + get_local_id(1)*W) );
-
-        #pragma unroll 16
-        for (size_t w = 0; w < W/8; w++)
-        {
-            half8 sh = lsrc[w];
-            float8 valf = convert_float8(sh);
-
-            sum4 += valf;
-            sum24 += valf*valf;
-        }
-
-        for (size_t w = W/8*8; w < W; w++)
-        {
-            float val = (float)src_line[get_local_id(1)*W + w];
-            sum  += val;
-            sum2 += val*val;
-        }
-
-        mbuf[get_local_id(1)] = sum4.s0  + sum4.s1  + sum4.s2  + sum4.s3  + sum4.s4  + sum4.s5  + sum4.s6  + sum4.s7  + sum;
-        vbuf[get_local_id(1)] = sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(1) == 0)
-    {
-        float res  = 0;
-        float res2 = 0;
-
-        for (int i = 0; i < get_local_size(1); i++)
-        {
-            res  += mbuf[i];
-            res2 += vbuf[i];
-        }
-
-// requires memory reset before layer execution
-#if USE_ATOMICS
-        int idx = (across_channels == 0) ? c : 0;
-
-        atomic_add_global(mean + idx, res);
-        atomic_add_global(variance + idx, res2);
-#else
-        int idx = c*get_num_groups(1) + get_group_id(1);
-
-        mean[idx] = res;
-        variance[idx] = res2;
-#endif
-    }
-}
-
-__kernel void __dma_preload_mvn_scale(const __global half * restrict src,
-                          __global half * restrict dst,
-                          __global float * restrict mean_part,
-                          __global float * restrict power_mean,
-                          int W,
-                          int H1,
-                          int across_channels,
-                          int normalize_variance,
-                          int nparts,
-                          __local half * restrict src_line,
-                          __local half * restrict dst_line
-                          )
-{
-    WorkGroupDmaCreateStrideTransaction(
-        src + get_group_id(1)*get_local_size(1)*W +
-              get_group_id(2)*get_local_size(2)*W*get_global_size(1), // src
-        src_line,                                                     // dst
-        W*get_local_size(1) * sizeof(half), // src width
-        W*get_local_size(1) * sizeof(half), // dst width
-        W*get_global_size(1) * sizeof(half), // src stride
-        W*get_local_size(1) * sizeof(half),  // dst stride
-        W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size
-        0
-        );
-}
-
-__kernel void __dma_postwrite_mvn_scale(const __global half * restrict src,
-                          __global half * restrict dst,
-                          __global float * restrict mean_part,
-                          __global float * restrict power_mean,
-                          int W,
-                          int H1,
-                          int across_channels,
-                          int normalize_variance,
-                          int nparts,
-                          __local half * restrict src_line,
-                          __local half * restrict dst_line)
-{
-    WorkGroupDmaCreateStrideTransaction(
-        dst_line,                                                     // src
-        dst + get_group_id(1)*get_local_size(1)*W +
-              get_group_id(2)*get_local_size(2)*W*get_global_size(1), // dst
-        W*get_local_size(1) * sizeof(half), // src width
-        W*get_local_size(1) * sizeof(half), // dst width
-        W*get_local_size(1) * sizeof(half),  // dst stride
-        W*get_global_size(1) * sizeof(half), // src stride
-        W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size
-        0
-        );
-}
-
-__kernel void mvn_scale(const __global half * restrict src,
-                          __global half * restrict dst,
-                          __global float * restrict mean_part,
-                          __global float * restrict power_mean,
-                          int W,
-                          int H1,
-                          int across_channels,
-                          int normalize_variance,
-                          int nparts,
-                          __local half * restrict src_line,
-                          __local half * restrict dst_line)
-{
-    int h = get_global_id(1);
-    int H = get_global_size(1);
-
-    // can we avoid this check and use min/max? We can pass number of groups just as a param.
-//#if !USE_ATOMICS
-//    if (h >= H1) return;
-//#endif
-
-    int c = get_global_id(2);
-    int C = get_global_size(2);
-
-    int idx     = (across_channels == 0) ? nparts*c : 0;
-    float scale = (across_channels == 0) ?      H*W : H*W*C;
-
-#if USE_ATOMICS
-    float mean = mean_part[idx];
-    float variance = power_mean[idx];
-#else
-
-    int total   = (across_channels == 0) ?   nparts : nparts*C;
-    float mean = 0.f;
-    float variance = 0.f;
-
-    for (int i = 0; i < total; i++)
-    {
-        mean     += mean_part[idx+i];
-        variance += power_mean[idx+i];
-    }
-#endif
-
-    mean = mean/scale;
-    variance = variance/scale;
-    variance = variance - mean*mean;
-    variance = native_sqrt(variance) + 1e-9f;
-
-    half hmean = mean;
-    half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);
-
-    const __local half8 * restrict src_data8 = (const __local half8 * restrict)(src_line + get_local_id(1)*W);
-    __local half8 * restrict dst_data8 = (__local half8 * restrict)(dst_line + get_local_id(1)*W);
-
-    #pragma unroll 16
-    for (size_t w = 0; w < W/8; w++)
-    {
-        dst_data8[w] = (src_data8[w] - hmean) * hvariance;
-    }
-    for (size_t w = W/8*8; w < W; w++)
-    {
-        dst_line[get_local_id(1)*W + w] = (src_line[get_local_id(1)*W + w] - hmean) * hvariance;
-    }
-}
-
-#else
-
-__kernel void reduction_mean(const __global half*  restrict src,
-                                   __global float* restrict mean,
-                                   __global float* restrict variance,
-                            int W,
-                            int H,
-                            int across_channels,
-                            __local half* restrict src_line) // for compatimility with DMA kernel
-{
-    int h = get_global_id(1);
-    int c = get_global_id(2);
-
-    const int MAX_LOCAL_SIZE = 8;
-
-    __local float mbuf[MAX_LOCAL_SIZE];
-    __local float vbuf[MAX_LOCAL_SIZE];
-
-    mbuf[get_local_id(1)] = 0;
-    vbuf[get_local_id(1)] = 0;
-
-    if (h < H)
-    {
-        float sum  = 0.f;
-        float sum2 = 0.f;
-
-        float8 sum4  = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-        float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-
-        const __global half8* src_line = (const __global half8 *)(src + c*H*W + h*W);
-
-        #pragma unroll 16
-        for (size_t w = 0; w < W/8; w++)
-        {
-            half8 sh = src_line[w];
-            float8 valf = convert_float8(sh);
-
-            sum4 += valf;
-            sum24 += valf*valf;
-        }
-
-        for (size_t w = W/8*8; w < W; w++)
-        {
-            float val = (float)src[c*H*W + h*W + w];
-
-            sum  += val;
-            sum2 += val*val;
-        }
-
-        mbuf[get_local_id(1)] = sum4.s0  + sum4.s1  + sum4.s2  + sum4.s3  + sum4.s4  + sum4.s5  + sum4.s6  + sum4.s7  + sum;
-        vbuf[get_local_id(1)] = sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if (get_local_id(1) == 0)
-    {
-        float res  = 0;
-        float res2 = 0;
-
-        for (int i = 0; i < get_local_size(1); i++)
-        {
-            res  += mbuf[i];
-            res2 += vbuf[i];
-        }
-
-// requires memory reset before layer execution
-#if USE_ATOMICS
-        int idx = (across_channels == 0) ? c : 0;
-
-        atomic_add_global(mean + idx, res);
-        atomic_add_global(variance + idx, res2);
-#else
-        int idx = c*get_num_groups(1) + get_group_id(1);
-
-        mean[idx] = res;
-        variance[idx] = res2;
-#endif
-    }
-}
-
-__kernel void mvn_scale(const __global half * restrict src_data,
-                          __global half * restrict dst_data,
-                          __global float * restrict mean_part,
-                          __global float * restrict power_mean,
-                          int W,
-                          int H1,
-                          int across_channels,
-                          int normalize_variance,
-                          int nparts,
-                          __local half * restrict src_line,
-                          __local half * restrict dst_line)
-{
-    int h = get_global_id(1);
-    int H = get_global_size(1);
-
-    // can we avoid this check and use min/max? We can pass number of groups just as a param.
-//#if !USE_ATOMICS
-//    if (h >= H1) return;
-//#endif
-
-    int c = get_global_id(2);
-    int C = get_global_size(2);
-
-    int idx     = (across_channels == 0) ? nparts*c : 0;
-    float scale = (across_channels == 0) ?      H*W : H*W*C;
-
-#if USE_ATOMICS
-    float mean = mean_part[idx];
-    float variance = power_mean[idx];
-#else
-
-    int total   = (across_channels == 0) ?   nparts : nparts*C;
-    float mean = 0.f;
-    float variance = 0.f;
-
-    for (int i = 0; i < total; i++)
-    {
-        mean     += mean_part[idx+i];
-        variance += power_mean[idx+i];
-    }
-#endif
-
-    mean = mean/scale;
-    variance = variance/scale;
-    variance = variance - mean*mean;
-    variance = native_sqrt(variance) + 1e-9f;
-
-    half hmean = mean;
-    half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);
-
-    const __global half8 * restrict src_data8 = (const __global half8 * restrict)(src_data + c*H*W + h*W);
-    __global half8 * restrict dst_data8 = (__global half8 * restrict)(dst_data + c*H*W + h*W);
-
-    #pragma unroll 16
-    for (size_t w = 0; w < W/8; w++)
-    {
-        dst_data8[w] = (src_data8[w] - hmean) * hvariance;
-    }
-    for (size_t w = W/8*8; w < W; w++)
-    {
-        dst_data[c*H*W + h*W + w] = (src_data[c*H*W + h*W + w] - hmean) * hvariance;
-    }
-}
-
-#endif // USE_MANUAL_DMA
--- a/inference-engine/src/vpu/custom_kernels/mvn_reduction.cl
+++ b/inference-engine/src/vpu/custom_kernels/mvn_reduction.cl
@ -0,0 +1,115 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+// Set to 1 only if output is zerroed before kernel execution
+#define USE_ATOMICS 0
+
+void atomic_add_global(volatile __global float *source, const float operand)
+{
+    union {
+        unsigned int intVal;
+        float floatVal;
+    } newVal;
+    union {
+        unsigned int intVal;
+        float floatVal;
+    } prevVal;
+
+    do {
+        prevVal.floatVal = *source;
+        newVal.floatVal  = prevVal.floatVal + operand;
+    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
+}
+
+__kernel void reduction_mean(
+    __global const half *restrict src,
+    __global float *restrict mean,
+    __global float *restrict variance,
+    int W,
+    int H,
+    int across_channels)
+{
+    __local half src_line[4 * 1024];
+    event_t e;
+
+    e = async_work_group_copy_2D2D(
+        src_line, // dst
+        src + get_group_id(1) * get_local_size(1) * W
+            + get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src
+        W * get_local_size(1), // num_elements_per_line,
+        get_local_size(2), // num_lines,
+        W * (get_global_size(1) - get_local_size(1)), // src_line_stride,
+        0, // dst_line_stride,
+        0);
+
+    wait_group_events(1, &e);
+
+    int h = get_global_id(1);
+    int c = get_global_id(2);
+
+    const int MAX_LOCAL_SIZE = 8;
+
+    __local float mbuf[MAX_LOCAL_SIZE];
+    __local float vbuf[MAX_LOCAL_SIZE];
+
+    mbuf[get_local_id(1)] = 0;
+    vbuf[get_local_id(1)] = 0;
+
+    if (h < H) {
+        float sum  = 0.f;
+        float sum2 = 0.f;
+
+        float8 sum4  = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+        float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+
+        const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W));
+
+        #pragma unroll 16
+        for (size_t w = 0; w < W / 8; w++) {
+            half8 sh    = lsrc[w];
+            float8 valf = convert_float8(sh);
+
+            sum4 += valf;
+            sum24 += valf * valf;
+        }
+
+        for (size_t w = W / 8 * 8; w < W; w++) {
+            float val = (float)src_line[get_local_id(1) * W + w];
+            sum += val;
+            sum2 += val * val;
+        }
+
+        mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum;
+        vbuf[get_local_id(1)] =
+            sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (get_local_id(1) == 0) {
+        float res  = 0;
+        float res2 = 0;
+
+        for (int i = 0; i < get_local_size(1); i++) {
+            res += mbuf[i];
+            res2 += vbuf[i];
+        }
+
+// requires memory reset before layer execution
+#if USE_ATOMICS
+        int idx = (across_channels == 0) ? c : 0;
+
+        atomic_add_global(mean + idx, res);
+        atomic_add_global(variance + idx, res2);
+#else
+        int idx = c * get_num_groups(1) + get_group_id(1);
+
+        mean[idx]     = res;
+        variance[idx] = res2;
+#endif
+    }
+}
--- a/inference-engine/src/vpu/custom_kernels/mvn_scale.cl
+++ b/inference-engine/src/vpu/custom_kernels/mvn_scale.cl
@ -0,0 +1,68 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+// Set to 1 only if output is zerroed before kernel execution
+#define USE_ATOMICS 0
+
+__attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale(
+    const __global half *restrict src,
+    __global float *restrict mean_part,
+    __global float *restrict power_mean,
+    __global half *restrict dst,
+    int W,
+    int H1,
+    int across_channels,
+    int normalize_variance,
+    int nparts)
+{
+    __local half src_line[4 * 1024];
+    __local half dst_line[4 * 1024];
+
+    int c = get_group_id(2);
+    int C = get_global_size(2);
+
+    int h = get_group_id(1);
+    int H = get_global_size(1);
+
+    event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0);
+    wait_group_events(1, &e1);
+
+    int idx     = (across_channels == 0) ? nparts * c : 0;
+    float scale = (across_channels == 0) ? H * W : H * W * C;
+
+#if USE_ATOMICS
+    float mean     = mean_part[idx];
+    float variance = power_mean[idx];
+#else
+
+    int total      = (across_channels == 0) ? nparts : nparts * C;
+    float mean     = 0.f;
+    float variance = 0.f;
+
+    for (int i = 0; i < total; i++) {
+        mean += mean_part[idx + i];
+        variance += power_mean[idx + i];
+    }
+#endif
+
+    mean     = mean / scale;
+    variance = variance / scale;
+    variance = variance - mean * mean;
+    variance = native_sqrt(variance) + 1e-9f;
+
+    half hmean     = mean;
+    half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);
+
+    for (size_t w = 0; w < W; w++) {
+        dst_line[w] = (src_line[w] - hmean) * hvariance;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0);
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/quantize.cl
+++ b/inference-engine/src/vpu/custom_kernels/quantize.cl
@ -1,176 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-__kernel void __dma_preload_quantize(__global half const *const restrict src,
-                                     __global half const *const restrict input_low,
-                                     __global half const *const restrict input_high,
-                                     __global half const *const restrict output_low,
-                                     __global half const *const restrict output_high,
-                                     __global half       *const restrict dst,
-                                     int levels,
-                                     int input_low_size,
-                                     int input_high_size,
-                                     int output_low_size,
-                                     int output_high_size,
-                                     int W,
-                                     int C,
-                                     __local half       *const restrict local_src,
-                                     __local half const *const restrict local_dst)
-{
-    WorkGroupDmaCreateStrideTransaction(
-        src + get_group_id(1) * get_local_size(1) * W, // src
-        local_src, // dst
-        W * sizeof(half), // src_width,
-        W * sizeof(half), // dst_width,
-        get_global_size(1) * W * sizeof(half), // src_stride,
-        W * sizeof(half), // dst_stride,
-        W * C * sizeof(half), // size
-        0);
-}
-
-__kernel void __dma_postwrite_quantize(__global half const *const restrict src,
-                                       __global half const *const restrict input_low,
-                                       __global half const *const restrict input_high,
-                                       __global half const *const restrict output_low,
-                                       __global half const *const restrict output_high,
-                                       __global half       *const restrict dst,
-                                       int levels,
-                                       int input_low_size,
-                                       int input_high_size,
-                                       int output_low_size,
-                                       int output_high_size,
-                                       int W,
-                                       int C,
-                                       __local half const *const restrict local_src,
-                                       __local half const *const restrict local_dst)
-{
-    WorkGroupDmaCreateStrideTransaction(
-        local_dst, // src
-        dst + get_group_id(1) * get_local_size(1) * W, // dst
-        W * sizeof(half), // src_width,
-        W * sizeof(half), // dst_width,
-        W * sizeof(half), // src_stride,
-        get_global_size(1) * W * sizeof(half), // dst_stride,
-        W * C * sizeof(half), // size
-        0);
-}
-
-__kernel void quantize(__global half const *const restrict src,
-                       __global half const *const restrict input_low,
-                       __global half const *const restrict input_high,
-                       __global half const *const restrict output_low,
-                       __global half const *const restrict output_high,
-                       __global half const *const restrict dst,
-                       int levels,
-                       int input_low_size,
-                       int input_high_size,
-                       int output_low_size,
-                       int output_high_size,
-                       int W,
-                       int C,
-                       __local half const *const restrict local_src,
-                       __local half       *const restrict local_dst)
-{
-    int h = get_global_id(1);
-    int H = get_global_size(1);
-
-    for (int c = 0; c < C; c++)
-    {
-        half h_ilow  = (input_low_size   == 1 ? input_low[0]   : input_low[c]);
-        half h_ihigh = (input_high_size  == 1 ? input_high[0]  : input_high[c]);
-        half h_olow  = (output_low_size  == 1 ? output_low[0]  : output_low[c]);
-        half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);
-
-        half const1 = (half)(!(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
-        half const2 = (half)(!(levels - 1)       ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));
-
-        __local const half* restrict addr_src = local_src + c*W;
-        __local       half* restrict addr_dst = local_dst + c*W;
-
-        for (int w = 0; w < W / 8; w++)
-        {
-            half8 val = *((__local half8*)addr_src + w);
-#if 1
-            // round is too slow =( 902 b of code
-            //half8 aux = round((val - (half8)h_ilow) * (half8)const1);
-
-            half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;
-
-            aux = (half8){
-                (half)(short)(aux.s0),
-                (half)(short)(aux.s1),
-                (half)(short)(aux.s2),
-                (half)(short)(aux.s3),
-                (half)(short)(aux.s4),
-                (half)(short)(aux.s5),
-                (half)(short)(aux.s6),
-                (half)(short)(aux.s7)
-            };
-
-            aux = aux * (half8)const2 + (half8)h_olow;
-
-            // vector comparison add 756 b of assembly, so do in manually
-            // short8 a = val <= (half8)h_olow;
-            // short8 b = val >  (half8)h_ohigh;
-
-            short8 a;
-            short8 b;
-            a.s0 = (val.s0 <= h_ilow);
-            a.s1 = (val.s1 <= h_ilow);
-            a.s2 = (val.s2 <= h_ilow);
-            a.s3 = (val.s3 <= h_ilow);
-            a.s4 = (val.s4 <= h_ilow);
-            a.s5 = (val.s5 <= h_ilow);
-            a.s6 = (val.s6 <= h_ilow);
-            a.s7 = (val.s7 <= h_ilow);
-
-            b.s0 = (val.s0 > h_ihigh);
-            b.s1 = (val.s1 > h_ihigh);
-            b.s2 = (val.s2 > h_ihigh);
-            b.s3 = (val.s3 > h_ihigh);
-            b.s4 = (val.s4 > h_ihigh);
-            b.s5 = (val.s5 > h_ihigh);
-            b.s6 = (val.s6 > h_ihigh);
-            b.s7 = (val.s7 > h_ihigh);
-
-            a = ~(a-(short8)1);
-            b = ~(b-(short8)1);
-
-            short8 c1 = (~a &  b);
-            short8 c2 = (~a & ~b);
-
-            short8 res = a  & as_short8((half8)h_olow)
-                       | c1 & as_short8((half8)h_ohigh)
-                       | c2 & as_short8(aux);
-
-            *((__local half8*)addr_dst + w) = as_half8(res);
-#else
-            *((__local half8*)addr_dst + w) = val;
-#endif
-        }
-
-        for (int w = W & (~0x7); w < W; w++)
-        //for (int w = 0         ; w < W; w++)
-        {
-            half val = addr_src[w];
-#if 1
-            short a = val <= h_ilow; a = ~(a-1);
-            short b = val > h_ihigh; b = ~(b-1);
-
-            short c1 = (~a &  b);
-            short c2 = (~a & ~b);
-
-            short res = a  & as_short(h_olow)
-                      | c1 & as_short(h_ohigh)
-                      | c2 & as_short(((half)(round( (val - h_ilow) * const1) * const2) + h_olow));
-
-            addr_dst[w] = as_half(res);
-#else
-            addr_dst[w] = val;
-#endif
-        }
-    }
-}
--- a/inference-engine/src/vpu/custom_kernels/region.cl
+++ b/inference-engine/src/vpu/custom_kernels/region.cl
@ -1,474 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))
-
-#define ALLOW_EARLY_RETURN 1
-
-#define USE_MANUAL_DMA 1
-
-#if USE_MANUAL_DMA
-
-static void inline logistic_activate(__local const half* restrict src,
-                                     __local       half* restrict dst,
-                                     int offset)
-{
-    half val = src[offset];
-    val = 1.0h / (1.0h + exp2(val * -log_2_e));
-    dst[offset] = val;
-}
-
-__kernel void __dma_preload_region_chw(
-    __global const half* restrict src,
-    __global       half* restrict _0,
-    __local        half* restrict local_src,
-    __local        half* restrict _1,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    const int local_C = classes + coords + 1;
-    const int c = get_group_id(1)*local_C;
-    const int h = get_group_id(0);
-
-    WorkGroupDmaCreateStrideTransaction(
-        src + c*H*W + h*W, // src
-        local_src, // dst
-        W*sizeof(half), // src_width,
-        W*sizeof(half), // dst_width,
-        W*H*sizeof(half), // src_stride,
-        W*sizeof(half), // dst_stride,
-        W*local_C*sizeof(half), // size
-        0);
-}
-
-__kernel void __dma_postwrite_region_chw(
-    __global       half* restrict _0,
-    __global       half* restrict dst,
-    __local        half* restrict _1,
-    __local  const half* restrict local_dst,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    const int local_C = classes + coords + 1;
-    const int c = get_group_id(1)*local_C;
-    const int h = get_group_id(0);
-
-    WorkGroupDmaCreateStrideTransaction(
-        local_dst, // src
-        dst + c*H*W + h*W, // dst
-        W*sizeof(half), // src_width,
-        W*sizeof(half), // dst_width,
-        W*sizeof(half), // src_stride,
-        W*H*sizeof(half), // dst_stride,
-        W*local_C*sizeof(half), // size
-        0);
-}
-
-__kernel void region_chw(
-    __global       half* restrict src_data,
-    __global       half* restrict dst_data,
-    __local  const half* restrict local_src,
-    __local        half* restrict local_dst,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    const int w = get_local_id(0);
-
-#if ALLOW_EARLY_RETURN
-    if (w >= W) return;
-#endif
-
-    __local const half *restrict src = local_src + w;
-    __local       half *restrict dst = local_dst + w;
-
-    const int stride = W;
-    logistic_activate(src, dst, 0*stride);
-    logistic_activate(src, dst, 1*stride);
-
-    //copy plane 2 and 3
-    dst[2*stride] = src[2*stride];
-    dst[3*stride] = src[3*stride];
-
-    logistic_activate(src, dst, 4*stride);
-
-    src += (coords + 1)*stride;
-    dst += (coords + 1)*stride;
-
-    if (doSoftmax)
-    {
-        half max_val = src[0];
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            max_val = max(max_val, src[c*stride]);
-        }
-
-        half expSum = 0.0h;
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            const half e = src[c*stride] - max_val;
-            const half tmp = exp2(e * log_2_e);
-            dst[c*stride] = tmp;
-            expSum += tmp;
-        }
-
-        const half invExpSum = 1.0h / expSum;
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            dst[c*stride] *= invExpSum;
-        }
-    }
-    else
-    {
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            logistic_activate(src, dst, c*stride);
-        }
-    }
-}
-
-__kernel void __dma_preload_region_hwc(
-    __global const half* restrict src,
-    __global       half* restrict _0,
-    __local        half* restrict local_src,
-    __local        half* restrict _1,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    const int local_C = classes + coords + 1;
-    const int c = get_group_id(1)*local_C;
-    const int h = get_group_id(0);
-    if (!doSoftmax) num = maskSize;
-    const int C = local_C*num;
-
-    WorkGroupDmaCreateStrideTransaction(
-        src + h*W*C + c, // src
-        local_src, // dst
-        local_C*sizeof(half), // src_width,
-        local_C*sizeof(half), // dst_width,
-        C*sizeof(half), // src_stride,
-        local_C*sizeof(half), // dst_stride,
-        local_C*W*sizeof(half), // size
-        0);
-}
-
-__kernel void __dma_postwrite_region_hwc(
-    __global       half* restrict _0,
-    __global       half* restrict dst,
-    __local        half* restrict _1,
-    __local  const half* restrict local_dst,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    // Region always outputs in CHW layout; same as postwrite_chw
-    const int local_C = classes + coords + 1;
-    const int c = get_group_id(1)*local_C;
-    const int h = get_group_id(0);
-
-    WorkGroupDmaCreateStrideTransaction(
-        local_dst, // src
-        dst + c*H*W + h*W, // dst
-        W*sizeof(half), // src_width,
-        W*sizeof(half), // dst_width,
-        W*sizeof(half), // src_stride,
-        W*H*sizeof(half), // dst_stride,
-        W*local_C*sizeof(half), // size
-        0);
-}
-
-static void inline logistic_activate_hwc(__local const half* restrict src,
-                                         __local       half* restrict dst,
-                                         int offset,
-                                         int stride)
-{
-    half val = src[offset];
-    val = 1.0h / (1.0h + exp2(val * -log_2_e));
-    dst[offset*stride] = val;
-}
-
-__kernel void region_hwc(
-    __global       half* restrict src_data,
-    __global       half* restrict dst_data,
-    __local  const half* restrict local_src,
-    __local        half* restrict local_dst,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    const int w = get_local_id(0);
-
-#if ALLOW_EARLY_RETURN
-    if (w >= W) return;
-#endif
-
-    const int local_C = classes + coords + 1;
-
-    __local const half *restrict src = local_src + w*local_C;
-    __local       half *restrict dst = local_dst + w;
-
-    const int stride = W;
-    logistic_activate_hwc(src, dst, 0, stride);
-    logistic_activate_hwc(src, dst, 1, stride);
-
-    //copy plane 2 and 3
-    dst[2*stride] = src[2];
-    dst[3*stride] = src[3];
-
-    logistic_activate_hwc(src, dst, 4, stride);
-
-    src += coords + 1;
-    dst += (coords + 1)*stride;
-
-    if (doSoftmax)
-    {
-        half max_val = src[0];
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            max_val = max(max_val, src[c]);
-        }
-
-        half expSum = 0.0h;
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            const half e = src[c] - max_val;
-            const half tmp = exp2(e * log_2_e);
-            dst[c*stride] = tmp;
-            expSum += tmp;
-        }
-
-        const half invExpSum = 1.0h / expSum;
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            dst[c*stride] *= invExpSum;
-        }
-    }
-    else
-    {
-        #pragma unroll 4
-        for (int c = 0; c < classes; c++)
-        {
-            logistic_activate_hwc(src, dst, c, stride);
-        }
-    }
-}
-
-#else // defined (USE_MANUAL_DMA)
-
-#define NUM_CLASSES 80
-
-static void inline logistic_activate(__global const half* restrict src,
-                                     __global       half* restrict dst,
-                                     int offset)
-{
-    half val = src[offset];
-    val = 1.0h / (1.0h + exp2(val * -log_2_e));
-    dst[offset] = val;
-}
-
-__kernel void region_chw(
-    __global const half* restrict global_src,
-    __global       half* restrict global_dst,
-    __local        half* restrict _0,
-    __local        half* restrict _1,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    const int w = get_local_id(0);
-
-#if ALLOW_EARLY_RETURN
-    if (w >= W) return;
-#endif
-
-    const int local_C = classes + coords + 1;
-    const int c = get_group_id(1)*local_C;
-    const int h = get_group_id(0);
-
-    __global const half *restrict src = global_src + c*H*W + h*W + w;
-    __global       half *restrict dst = global_dst + c*H*W + h*W + w;
-
-    const int stride = H*W;
-    logistic_activate(src, dst, 0*stride);
-    logistic_activate(src, dst, 1*stride);
-
-    //copy plane 2 and 3
-    dst[2*stride] = src[2*stride];
-    dst[3*stride] = src[3*stride];
-
-    logistic_activate(src, dst, 4*stride);
-
-    src += (coords + 1)*stride;
-    dst += (coords + 1)*stride;
-
-    if (doSoftmax)
-    {
-        __private half data[NUM_CLASSES];
-
-        half max_val = src[0];
-        for (int c = 0; c < classes; c++)
-        {
-            half tmp = src[c*stride];
-            data[c] = tmp;
-            max_val = max(max_val, tmp);
-        }
-
-        half expSum = 0.0h;
-        for (int c = 0; c < classes; c++)
-        {
-            half tmp = half_exp(data[c] - max_val);
-            data[c] = tmp;
-            expSum += tmp;
-        }
-
-        for (int c = 0; c < classes; c++)
-        {
-            dst[c*stride] = data[c] / expSum;
-        }
-    }
-    else
-    {
-        #pragma unroll 4
-        for (int c = 0;  c < classes; c++)
-        {
-            logistic_activate(src, dst, c*stride);
-        }
-    }
-}
-
-static void inline logistic_activate_hwc(__global const half* restrict src,
-                                         __global       half* restrict dst,
-                                         int offset,
-                                         int stride)
-{
-    half val = src[offset];
-    val = 1.0h / (1.0h + exp2(val * -log_2_e));
-    dst[offset*stride] = val;
-}
-
-
-__kernel void region_hwc(
-    __global const half* restrict global_src,
-    __global       half* restrict global_dst,
-    __local        half* restrict _0,
-    __local        half* restrict _1,
-    int W,           /* 13 */
-    int H,           /* 13 */
-    int classes,     /* 20 */
-    int coords,      /* 4  */
-    int num,         /* 5  */
-    int maskSize,
-    int doSoftmax
-    )
-{
-    const int w = get_local_id(0);
-
-#if ALLOW_EARLY_RETURN
-    if (w >= W) return;
-#endif
-
-    const int local_C = classes + coords + 1;
-    const int c = get_group_id(1)*local_C;
-    const int h = get_group_id(0);
-    const int C = num*local_C;
-
-    __global const half *restrict src = global_src + h*W*C + w*C + c;
-    __global       half *restrict dst = global_dst + c*H*W + h*W + w;
-
-    const int stride = H*W;
-    logistic_activate_hwc(src, dst, 0, stride);
-    logistic_activate_hwc(src, dst, 1, stride);
-
-    //copy plane 2 and 3
-    dst[2*stride] = src[2];
-    dst[3*stride] = src[3];
-
-    logistic_activate_hwc(src, dst, 4, stride);
-
-    src += coords + 1;
-    dst += (coords + 1)*stride;
-
-    if (doSoftmax)
-    {
-        __private half data[NUM_CLASSES];
-
-        half max_val = src[0];
-        for (int c = 0; c < classes; c++)
-        {
-            half tmp = src[c];
-            data[c] = tmp;
-            max_val = max(max_val, tmp);
-        }
-
-        half expSum = 0.0h;
-        for (int c = 0; c < classes; c++)
-        {
-            half tmp = half_exp(data[c] - max_val);
-            data[c] = tmp;
-            expSum += tmp;
-        }
-
-        for (int c = 0; c < classes; c++)
-        {
-            dst[c*stride] = data[c] / expSum;
-        }
-    }
-    else
-    {
-        #pragma unroll 4
-        for (int c = 0;  c < classes; c++)
-        {
-            logistic_activate_hwc(src, dst, c, stride);
-        }
-    }
-}
-
-#endif // defined (USE_MANUAL_DMA)
--- a/inference-engine/src/vpu/custom_kernels/region_chw.cl
+++ b/inference-engine/src/vpu/custom_kernels/region_chw.cl
@ -3,75 +3,106 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-#define NUM_CLASSES 80
+__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))

-#define nlog_2_e ((half)(-1.442695040888963))
+#define ALLOW_EARLY_RETURN 1

-static void logistic_activate(__global const half* restrict src_data,
-                              __global       half* restrict dst_data,
-                              int offset)
+static void inline logistic_activate(__local const half *restrict src, __local half *restrict dst, int offset)
 {
-    half val = src_data[offset];
-    val = 1.f/(1.f + __builtin_shave_sau_exp2_f16_l_r(val*nlog_2_e));
-    dst_data[offset] = val;
+    half val    = src[offset];
+    val         = 1.0h / (1.0h + exp2(val * -log_2_e));
+    dst[offset] = val;
 }

-__kernel void region_ocl(__global const half* restrict src_data,
-                         __global       half* restrict dst_data,
-                         int W,
-                         int H,
-                         int classes,
-                         int coords,
-                         int num,
-                         int maskSize,
-                         int doSoftmax)
+__kernel void region_chw(
+    __global const half *restrict src_data,
+    __global half *restrict dst_data,
+    int W,
+    int H,
+    int classes,
+    int coords,
+    int num,
+    int maskSize,
+    int doSoftmax)
 {
-    int box_sz = H * W * (classes + coords + 1);
-    int pixel_pos =  min((int)get_global_id(0), H*W);
-    int box = get_global_id(1);
+    __local half local_src[13 * 13 * (4 + 1 + 80)];
+    __local half local_dst[13 * 13 * (4 + 1 + 80)];

-    //if (pixel_pos >= H*W) return;
+    const int box_sz = W * H * (classes + coords + 1);
+    event_t e1       = async_work_group_copy(local_src, src_data + get_group_id(1) * box_sz, box_sz, 0);
+    wait_group_events(1, &e1);

-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W);
-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W);
+    const int pixel_pos = get_local_id(0);
+    const int stride    = W * H;

-    //copy plane 2 and 3
-    dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W];
-    dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W];
+#if ALLOW_EARLY_RETURN
+    if (pixel_pos < W * H)
+#endif
+    {
+        __local const half *restrict src = local_src + pixel_pos;
+        __local half *restrict dst       = local_dst + pixel_pos;

-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W);
+        logistic_activate(src, dst, 0 * stride);
+        logistic_activate(src, dst, 1 * stride);

-    int data_offset =  box * box_sz + (coords + 1) * W * H;
+        //copy plane 2 and 3
+        dst[2 * stride] = src[2 * stride];
+        dst[3 * stride] = src[3 * stride];

-    __private half data[NUM_CLASSES];
+        logistic_activate(src, dst, 4 * stride);

-    if (doSoftmax) {
-        half max_val = src_data[data_offset + 0*H*W + pixel_pos];
-        for (int c = 0; c < classes; c++) {
-            half tmp = src_data[data_offset + c*H*W + pixel_pos];
-            data[c] = tmp;
-            max_val = max( max_val, tmp);
-        }
+        src += (coords + 1) * stride;
+        dst += (coords + 1) * stride;

-        half expSum = 0.0f;
+        if (doSoftmax) {
+            half max_val = src[0];
+            #pragma unroll 4
+            for (int c = 1; c < classes; c++) {
+                max_val = max(max_val, src[c * stride]);
+            }

-        for (int c = 0; c < classes; c++) {
-            half tmp = half_exp(data[c] - max_val);
-            data[c] = tmp;
-            expSum += tmp;
-        }
-        for (int c = 0; c < classes; c++) {
-            data[c] = data[c] / expSum;
-        }
+            half expSum = 0.0h;
+            #pragma unroll 4
+            for (int c = 0; c < classes; c++) {
+                const half e    = src[c * stride] - max_val;
+                const half tmp  = exp2(e * log_2_e);
+                dst[c * stride] = tmp;
+                expSum += tmp;
+            }

-        for (int c = 0; c < classes; c++) {
-            dst_data[data_offset + c*H*W + pixel_pos + 0] = data[c];
-        }
-    }
-    else {
-        for (int i = 0;  i < classes; i++) {
-            logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + (5 + i)*H*W);
+            const half recip = 1.h / expSum;
+            int c            = 0;
+            for (; c < (classes & ~0x3); c += 4) {
+                const half t0 = dst[(c + 0) * stride];
+                const half t1 = dst[(c + 1) * stride];
+                const half t2 = dst[(c + 2) * stride];
+                const half t3 = dst[(c + 3) * stride];
+
+                const half e0 = t0 * recip;
+                const half e1 = t1 * recip;
+                const half e2 = t2 * recip;
+                const half e3 = t3 * recip;
+
+                dst[(c + 0) * stride] = e0;
+                dst[(c + 1) * stride] = e1;
+                dst[(c + 2) * stride] = e2;
+                dst[(c + 3) * stride] = e3;
+            }
+            for (; c < classes; c++) {
+                dst[c * stride] *= recip;
+            }
+        } else {
+            #pragma unroll 4
+            for (int c = 0; c < classes; c++) {
+                logistic_activate(src, dst, c * stride);
+            }
        }
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy(dst_data + get_group_id(1) * box_sz, local_dst, box_sz, 0);
+    wait_group_events(1, &e2);
 }
--- a/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch0.cl
+++ b/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch0.cl
@ -1,58 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define NUM_CLASSES 80
-
-static void logistic_activate(__global const half* restrict src_data,
-                              __global       half* restrict dst_data,
-                              int offset)
-{
-    half val = src_data[offset];
-    val = 1.0f/(1.0f + native_exp(-val));
-    dst_data[offset] = val;
-}
-
-__kernel void region_ocl(__global const half* restrict src_data,
-                         __global       half* restrict dst_data,
-                         int W,
-                         int H,
-                         int classes,
-                         int coords)
-{
-    const int box_sz = H * W * (classes + coords + 1);
-    const int pixel_pos = min((int)get_global_id(0), ((H*W) - 1));
-    const int box = get_global_id(1);
-
-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W);
-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W);
-
-    //copy plane 2 and 3
-    dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W];
-    dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W];
-
-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W);
-    int data_offset =  box * box_sz + (coords + 1) * W * H;
-
-    __private half data[NUM_CLASSES];
-
-    half max_val = src_data[data_offset + 0*H*W + pixel_pos];
-    for (int c = 0; c < classes; c++) {
-        half tmp = src_data[data_offset + c*H*W + pixel_pos];
-        data[c] = tmp;
-        max_val = max( max_val, tmp);
-    }
-
-    half expSum = 0.0f;
-
-    for (int c = 0; c < classes; c++) {
-        half tmp = half_exp(data[c] - max_val);
-        data[c] = tmp;
-        expSum += tmp;
-    }
-    for (int c = 0; c < classes; c++) {
-        dst_data[data_offset + c*H*W + pixel_pos + 0] = data[c] / expSum;
-    }
-}
--- a/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch1.cl
+++ b/inference-engine/src/vpu/custom_kernels/region_chw_m7_branch1.cl
@ -1,43 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define NUM_CLASSES 80
-
-static void logistic_activate(__global const half* restrict src_data,
-                              __global       half* restrict dst_data,
-                              int offset)
-{
-    half val = src_data[offset];
-    val = 1.0f/(1.0f + native_exp(-val));
-    dst_data[offset] = val;
-}
-
-__kernel void region_ocl(__global const half* restrict src_data,
-                         __global       half* restrict dst_data,
-                         int W,
-                         int H,
-                         int classes,
-                         int coords)
-{
-    int box_sz = H * W * (classes + coords + 1);
-    int pixel_pos = min((int)get_global_id(0), ((H*W) - 1));
-    int box = get_global_id(1);
-
-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W);
-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W);
-
-    //copy plane 2 and 3
-    dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W];
-    dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W];
-
-    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W);
-
-    int data_offset =  box * box_sz + (coords + 1) * W * H;
-
-    for (int i = 0;  i < classes; i++) {
-        logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + (5 + i)*H*W);
-    }
-}
--- a/inference-engine/src/vpu/custom_kernels/region_hwc.cl
+++ b/inference-engine/src/vpu/custom_kernels/region_hwc.cl
@ -0,0 +1,114 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))
+
+#define ALLOW_EARLY_RETURN 1
+
+static void inline logistic_activate_hwc(
+    __local const half *restrict src,
+    __local half *restrict dst,
+    int offset,
+    int stride)
+{
+    half val             = src[offset];
+    val                  = 1.0h / (1.0h + exp2(val * -log_2_e));
+    dst[offset * stride] = val;
+}
+
+__kernel void region_hwc(
+    __global const half *restrict src,
+    __global half *restrict dst,
+    int W,
+    int H,
+    int classes,
+    int coords,
+    int num,
+    int maskSize,
+    int doSoftmax)
+{
+    __local half local_src[13 * 13 * (4 + 1 + 80)];
+    __local half local_dst[13 * 13 * (4 + 1 + 80)];
+
+    const int pixel_pos = get_local_id(0);
+
+    const int local_C = classes + coords + 1;
+    const int c       = get_group_id(1) * local_C;
+    const int h       = get_group_id(0);
+
+    num         = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize;
+    const int C = local_C * num;
+
+    event_t e1 = async_work_group_copy_2D2D(
+        local_src, // dst
+        src + h * W * C + c, // src
+        local_C, // num_elements_per_line,
+        H * W, // num_lines,
+        C - local_C, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+
+    wait_group_events(1, &e1);
+
+#if ALLOW_EARLY_RETURN
+    if (pixel_pos < W * H)
+#endif
+    {
+        const int w = pixel_pos % W;
+        const int h = pixel_pos / W;
+
+        __local const half *restrict src = local_src + h * W * local_C + w * local_C;
+        __local half *restrict dst       = local_dst + h * W + w;
+
+        const int stride = H * W;
+        logistic_activate_hwc(src, dst, 0, stride);
+        logistic_activate_hwc(src, dst, 1, stride);
+
+        //copy plane 2 and 3
+        dst[2 * stride] = src[2];
+        dst[3 * stride] = src[3];
+
+        logistic_activate_hwc(src, dst, 4, stride);
+
+        src += coords + 1;
+        dst += (coords + 1) * stride;
+
+        if (doSoftmax) {
+            half max_val = src[0];
+            #pragma unroll 4
+            for (int c = 1; c < classes; c++) {
+                max_val = max(max_val, src[c]);
+            }
+
+            half expSum = 0.0h;
+            #pragma unroll 4
+            for (int c = 0; c < classes; c++) {
+                const half e    = src[c] - max_val;
+                const half tmp  = exp2(e * log_2_e);
+                dst[c * stride] = tmp;
+                expSum += tmp;
+            }
+
+            const half invExpSum = 1.0h / expSum;
+            #pragma unroll 4
+            for (int c = 0; c < classes; c++) {
+                dst[c * stride] *= invExpSum;
+            }
+        } else {
+            #pragma unroll 4
+            for (int c = 0; c < classes; c++) {
+                logistic_activate_hwc(src, dst, c, stride);
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const int box_sz = W * H * (classes + coords + 1);
+    event_t e2       = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0);
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/reorg_chw.cl
+++ b/inference-engine/src/vpu/custom_kernels/reorg_chw.cl
@ -3,119 +3,65 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-#define USE_MANUAL_DMA
-
-#if defined (USE_MANUAL_DMA)
-
-__kernel void __dma_preload_reorg_chw(__global half const *restrict src,
-                                      __global half       *restrict dst,
-                                      int W,
-                                      int H,
-                                      int C,
-                                      int stride,
-                                      __local half        *restrict local_src,
-                                      __local half        *restrict local_dst
-                                      )
+__kernel void reorg_chw(
+    __global const half *restrict src,
+    __global half *restrict dst,
+    int W,
+    int H,
+    int C,
+    int stride)
 {
-    const int stride_y = get_group_id(1);
+    __local half local_src[8 * 1024];
+    __local half local_dst[8 * 1024];

-    const int srcIdx = stride_y*W*stride + W*stride*stride*get_group_id(0);
-
-    WorkGroupDmaCreateStrideTransaction(
-        src + srcIdx, // src
+    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
-        W * stride * sizeof(half), // src width
-        W * stride * sizeof(half), // dst width
-        W * stride * stride * get_num_groups(0) * sizeof(half), // src stride
-        W * stride * sizeof(half),  // dst stride
-        W * stride * get_local_size(0) * sizeof(half), //total size
+        src + get_group_id(1) * W * stride
+            + get_group_id(0) * W * stride * stride, // src
+        W * stride, // num_elements_per_line,
+        get_local_size(0), // num_lines,
+        W * stride * (stride * get_num_groups(0) - 1), // src_line_stride,
+        0, // dst_line_stride,
        0);
-}
+    wait_group_events(1, &e1);

-__kernel void __dma_postwrite_reorg_chw(__global half const *restrict src,
-                                        __global half       *restrict dst,
-                                        int W,
-                                        int H,
-                                        int C,
-                                        int stride,
-                                        __local half       *restrict local_src,
-                                        __local half const *restrict local_dst
-                                        )
-{
-    const int stride_y = get_group_id(1);
-
-    const int dstIdx = stride_y*W*stride*get_global_size(0) + get_group_id(0)*W;
-
-    WorkGroupDmaCreateStrideTransaction(
-        local_dst, // src
-        dst + dstIdx, // dst
-        W * sizeof(half), // src width
-        W * sizeof(half), // dst width
-        W * sizeof(half), // src stride
-        W * get_num_groups(0) * sizeof(half),  // dst stride
-        get_local_size(0) * W * stride * sizeof(half), //total size
-        0);
-}
-
-__kernel void reorg_chw(__global half const *restrict src,
-                        __global half       *restrict dst,
-                        int W,
-                        int H,
-                        int C,
-                        int stride,
-                        __local half       *restrict local_src,
-                        __local half       *restrict local_dst
-                        )
-{
-    const int c = get_local_id(0);
+    const int c        = get_local_id(0);
    const int stride_x = get_local_id(1);

-    const int srcIdx = stride_x + c*W*stride;
-    const int dstIdx = stride_x*W*get_local_size(0) + c*W;
+    const int srcIdx = stride_x + c * W * stride;
+    const int dstIdx = stride_x * W * get_local_size(0) + c * W;

    int x = 0;
    for (; x <= W - 8; x += 8) {
-         half8 data = (half8) {
-             local_src[srcIdx + (x + 0)*stride], local_src[srcIdx + (x + 1)*stride],
-             local_src[srcIdx + (x + 2)*stride], local_src[srcIdx + (x + 3)*stride],
-             local_src[srcIdx + (x + 4)*stride], local_src[srcIdx + (x + 5)*stride],
-             local_src[srcIdx + (x + 6)*stride], local_src[srcIdx + (x + 7)*stride]
-         };
+        half8 data = (half8){
+            local_src[srcIdx + (x + 0) * stride],
+            local_src[srcIdx + (x + 1) * stride],
+            local_src[srcIdx + (x + 2) * stride],
+            local_src[srcIdx + (x + 3) * stride],
+            local_src[srcIdx + (x + 4) * stride],
+            local_src[srcIdx + (x + 5) * stride],
+            local_src[srcIdx + (x + 6) * stride],
+            local_src[srcIdx + (x + 7) * stride]};

-         *((__local half8*)(&local_dst[dstIdx + x])) = data;
+        *((__local half8 *)(&local_dst[dstIdx + x])) = data;
    }

    for (; x < W; x++) {
-        local_dst[dstIdx + x] = local_src[srcIdx + x*stride];
+        local_dst[dstIdx + x] = local_src[srcIdx + x * stride];
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy_2D2D(
+        dst + get_group_id(0) * W
+            + get_group_id(1) * W * stride * get_global_size(0), // dst
+        local_dst, // src
+        W, // num_elements_per_line
+        get_local_size(0) * stride, // num_lines
+        0, // src_line_stride
+        W * (get_num_groups(0) - 1), // dst_line_stride
+        0);
+    wait_group_events(1, &e2);
 }
-
-#else
-
-__kernel void reorg_chw(__global half const *restrict src,
-                        __global half       *restrict dst,
-                        int W,
-                        int H,
-                        int C,
-                        int stride,
-                        __local half const *restrict _0,
-                        __local half       *restrict _1
-                        )
-{
-    const int stride_x = get_local_id(1);
-    const int stride_y = get_group_id(1);
-    const int N = get_global_size(0);
-    const int c = get_local_id(0)*get_num_groups(0) + get_group_id(0);
-
-    const int srcIdx = c*W*stride*stride + stride_x + stride_y*W*stride;
-    const int dstIdx = c*W + stride_x*W*N + stride_y*W*N*stride;
-
-    #pragma unroll 8
-    for (int x = 0; x < W; x++) {
-        dst[dstIdx + x] = src[srcIdx + x*stride];
-    }
-}
-
-#endif
-
--- a/inference-engine/src/vpu/custom_kernels/reorg_chw_local.cl
+++ b/inference-engine/src/vpu/custom_kernels/reorg_chw_local.cl
@ -1,40 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-// kernel with local memory buffer
-__kernel void reorg(__global const half* restrict src,
-                    __global       half* restrict out,
-                    __local        half* restrict tmp,
-                    int H,
-                    int W,
-                    int stride)
-{
-    int h = min((int)get_global_id(0), H-1);
-
-    int c = get_global_id(1);
-    int C = get_global_size(1);
-    int C2 = C/(stride*stride);
-
-    int offset = c / C2;
-
-    int c2 = c - C2 * offset;
-
-    int H2 = H*stride;
-    int W2 = W*stride;
-
-    for (int w = 0; w < W; ++w)
-    {
-        int h2 = h*stride + offset / stride;
-        int w2 = w*stride + offset - stride * (offset / stride);
-
-        tmp[get_local_id(1)*get_local_size(0)*W + get_local_id(0)*W + w] = src[W2*H2*c2 + W2*h2 + w2];
-    }
-
-    for (int w = 0; w < W; ++w)
-    {
-        out[W*H*c + W*h + w] = tmp[get_local_id(1)*get_local_size(0)*W + get_local_id(0)*W + w];
-    }
-}
--- a/inference-engine/src/vpu/custom_kernels/reorg_chw_stack.cl
+++ b/inference-engine/src/vpu/custom_kernels/reorg_chw_stack.cl
@ -1,45 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define MAX_W 512
-
-// kernel that uses private memory on stack
-__kernel void reorg(__global const half* restrict src,
-                    __global       half* restrict out,
-                    int H,
-                    int W,
-                    int stride)
-{
-    int h = min((int)get_global_id(0), H-1);
-
-    int c = get_global_id(1);
-    int C = get_global_size(1);
-    int C2 = C/(stride*stride);
-
-    int offset = c / C2;
-
-    int c2 = c - C2 * offset;
-
-    int b = get_global_id(2);
-
-    __private half tmp[MAX_W];
-
-    int H2 = H*stride;
-    int W2 = W*stride;
-
-    for (int w = 0; w < W; ++w)
-    {
-        int h2 = h*stride + offset / stride;
-        int w2 = w*stride + offset - stride * (offset / stride);
-
-        tmp[w] = src[W2*H2*C2*b + W2*H2*c2 + W2*h2 + w2];
-    }
-
-    for (int w = 0; w < W; ++w)
-    {
-        out[W*H*C*b + W*H*c + W*h + w] = tmp[w];
-    }
-}
--- a/inference-engine/src/vpu/custom_kernels/reorg_hwc.cl
+++ b/inference-engine/src/vpu/custom_kernels/reorg_hwc.cl
@ -3,66 +3,32 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

-__kernel void __dma_preload_reorg_hwc(__global half const *restrict src,
-                                        __global half       *restrict _0,
-                                        int W,
-                                        int H,
-                                        int C,
-                                        int stride,
-                                        __local half       *restrict local_src,
-                                        __local half       *restrict _1
-                                        )
+__kernel void reorg_hwc(
+    __global half const *restrict src,
+    __global half *restrict dst,
+    int W,
+    int H,
+    int C,
+    int stride)
 {
-    const int stride_x = get_group_id(1);
+    __local half local_src[8 * 1024];
+    __local half local_dst[8 * 1024];

-    WorkGroupDmaCreateStrideTransaction(
-        src + get_group_id(0) * stride + stride_x * C, // src
+    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
-        stride * sizeof(half), // src_width,
-        stride * sizeof(half), // dst_width,
-        C * stride * sizeof(half), // src_stride,
-        stride * sizeof(half), // dst_stride,
-        H * W * sizeof(half), // size
+        src + get_group_id(0) * stride + get_group_id(1) * C, // src
+        stride, // num_elements_per_line
+        H * W / stride, // num_lines
+        (C - 1) * stride, // src_line_stride
+        0, // dst_line_stride
        0);
-}
+    wait_group_events(1, &e1);

-__kernel void __dma_postwrite_reorg_hwc(__global half const *restrict _0,
-                                        __global half       *restrict dst,
-                                        int W,
-                                        int H,
-                                        int C,
-                                        int stride,
-                                        __local half       *restrict _1,
-                                        __local half       *restrict local_dst
-                                        )
-{
-    const int stride_x = get_group_id(1);
-
-    WorkGroupDmaCreateStrideTransaction(
-        local_dst, // src
-        dst + stride_x * C + get_group_id(0) * stride, // dst
-        stride * sizeof(half), // src_width,
-        stride * sizeof(half), // dst_width,
-        stride * sizeof(half), // src_stride,
-        C * stride * sizeof(half), // dst_stride,
-        W * H * sizeof(half), // size
-        0);
-}
-
-__kernel void reorg_hwc(__global half const *restrict src,
-                        __global half       *restrict dst,
-                        int W,
-                        int H,
-                        int C,
-                        int stride,
-                        __local half        *restrict local_src,
-                        __local half        *restrict local_dst
-                        )
-{
    const int stride_y = get_local_id(1);
-    const int blocks = get_local_size(0);
-    const int b = get_local_id(0);
+    const int blocks   = get_local_size(0);
+    const int b        = get_local_id(0);

    const int OC = stride * stride;
    const int OH = H / stride;
@ -73,67 +39,27 @@ __kernel void reorg_hwc(__global half const *restrict src,

    for (int block_h = 0; block_h < stride; block_h++) {
        const int src_line = b * stride * stride + stride_y * stride + block_h;
-        const int c = src_line / IH;
-        const int h = src_line % IH;
+        const int c        = src_line / IH;
+        const int h        = src_line % IH;

        const int dst_line = b * stride + stride_y * blocks * stride + block_h;
-        const int oc = dst_line / OH;
-        const int oh = dst_line % OH;
+        const int oc       = dst_line / OH;
+        const int oh       = dst_line % OH;

        for (int w = 0; w < W / stride; w++) {
-            local_dst[oh*OW*OC + w*OC + oc] = local_src[h*IW*IC + w*IC + c];
-        }
-    }
-}
-
-__kernel void reorg_hwc_naive(__global half const *restrict src,
-                              __global half       *restrict dst,
-                              int W,
-                              int H,
-                              int C,
-                              int stride,
-                              __local half        *restrict local_src,
-                              __local half        *restrict local_dst
-                              )
-{
-    const int out_c = C / (stride * stride);
-    const int oc = C * (stride * stride);
-    const int oh = H / stride;
-    const int ow = W / stride;
-
-    const int c = get_global_id(0);
-
-    for (int h = 0; h < H; ++h)
-    {
-        int in_index = W * (h + H*c) + (0);
-        int new_z = in_index / (oh*ow);
-        int new_y = (in_index %(oh*ow)) / ow;
-        int new_x = (in_index %(oh*ow)) % ow;
-        int new_index = new_z + new_x * oc + new_y * oc * ow;
-
-        in_index++;
-
-        int c2 = c % out_c;
-        int offset = c / out_c;
-        int w2 = 0 * stride + offset % stride;
-        int h2 = h * stride + offset / stride;
-        int out_index = w2 + W * stride * (h2 + H * stride * c2);
-
-        #pragma unroll 2
-        for(int i = 0; i < W; ++i, out_index+=stride, in_index++)
-        {
-            // repacking coordinates
-            int k0 = out_index / (H*W);
-            int j0 = (out_index % (H*W)) / W;
-            int i0 = (out_index % (H*W)) % W;
-            int out_index_repack = k0 + C * i0 + C * W * j0;
-
-            dst[new_index] = src[out_index_repack];
-
-            int new_z = in_index / (oh*ow);
-            int new_y = (in_index %(oh*ow)) / ow;
-            int new_x = (in_index %(oh*ow)) % ow;
-            new_index = new_z + new_x * oc + new_y * oc * ow;
+            local_dst[oh * OW * OC + w * OC + oc] = local_src[h * IW * IC + w * IC + c];
        }
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy_2D2D(
+        dst + get_group_id(1) * C + get_group_id(0) * stride, // dst
+        local_dst, // src
+        stride, // num_elements_per_line
+        W * H / stride, // num_lines
+        0, // src_line_stride
+        C * stride - stride, // dst_line_stride
+        0);
+    wait_group_events(1, &e2);
 }
--- a/inference-engine/src/vpu/custom_kernels/reorg_hwc_naive.cl
+++ b/inference-engine/src/vpu/custom_kernels/reorg_hwc_naive.cl
@ -0,0 +1,53 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__kernel void reorg_hwc_naive(
+    __global half const *restrict src,
+    __global half *restrict dst,
+    int W,
+    int H,
+    int C,
+    int stride)
+{
+    const int out_c = C / (stride * stride);
+    const int oc    = C * (stride * stride);
+    const int oh    = H / stride;
+    const int ow    = W / stride;
+
+    const int c = get_global_id(0);
+
+    for (int h = 0; h < H; ++h) {
+        int in_index  = W * (h + H * c) + (0);
+        int new_z     = in_index / (oh * ow);
+        int new_y     = (in_index % (oh * ow)) / ow;
+        int new_x     = (in_index % (oh * ow)) % ow;
+        int new_index = new_z + new_x * oc + new_y * oc * ow;
+
+        in_index++;
+
+        int c2        = c % out_c;
+        int offset    = c / out_c;
+        int w2        = 0 * stride + offset % stride;
+        int h2        = h * stride + offset / stride;
+        int out_index = w2 + W * stride * (h2 + H * stride * c2);
+
+        #pragma unroll 2
+        for (int i = 0; i < W; ++i, out_index += stride, in_index++) {
+            // repacking coordinates
+            int k0               = out_index / (H * W);
+            int j0               = (out_index % (H * W)) / W;
+            int i0               = (out_index % (H * W)) % W;
+            int out_index_repack = k0 + C * i0 + C * W * j0;
+
+            dst[new_index] = src[out_index_repack];
+
+            int new_z = in_index / (oh * ow);
+            int new_y = (in_index % (oh * ow)) / ow;
+            int new_x = (in_index % (oh * ow)) % ow;
+            new_index = new_z + new_x * oc + new_y * oc * ow;
+        }
+    }
+}
--- a/inference-engine/src/vpu/custom_kernels/resample_AA.cl
+++ b/inference-engine/src/vpu/custom_kernels/resample_AA.cl
@ -0,0 +1,122 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+#define USE_OPTIMIZED_ROUND
+
+#ifdef USE_OPTIMIZED_ROUND
+#define ROUND(x) ((int)((x) + 0.5f))
+#else
+#define ROUND(x) (int)(round(x))
+#endif
+
+inline int out_to_in(float ox, float f)
+{
+#ifdef USE_OPTIMIZED_ROUND
+    return (int)((ox + 0.5f) / f);
+#else
+    return ROUND((ox + 0.5f) / f - 0.5f);
+#endif
+}
+
+static inline float triangleCoeff(float x) { return 1.0f - fabs(x); }
+
+static inline float4 triangleCoeff4(float4 x) { return 1.0f - fabs(x); }
+
+__kernel void resample_with_antialias(
+    __global const half *restrict src,
+    __global half *restrict dst,
+    int iw,
+    int ih,
+    float factor,
+    int ow,
+    int oh,
+    int channels)
+{
+    __local half local_src[20 * 1024];
+    __local half local_dst[8 * 1024];
+
+    const int r        = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
+    const int oy_first = get_group_id(1) * get_local_size(1);
+    const int oy_last  = (get_group_id(1) + 1) * get_local_size(1) - 1;
+    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
+    const int iy_last  = min(out_to_in(oy_last, factor) + r, ih - 1);
+    const int iy_size  = iy_last - iy_first + 1;
+
+    event_t e1 = async_work_group_copy_2D2D(
+        local_src, // dst
+        src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src
+        iy_size * iw, // num_elements_per_line,
+        get_local_size(2), // num_lines,
+        (ih - iy_size) * iw, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+    wait_group_events(1, &e1);
+
+    const int oy     = get_global_id(1);
+    const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;
+    const int iy     = ROUND(iy_f);
+
+    __local half const *restrict start_src =
+        local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2);
+    __local half *restrict start_dst =
+        local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);
+
+    for (int ox = 0; ox < ow; ox++) {
+        const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
+        const int ix_i   = ROUND(ix_f);
+
+        float4 v_sum  = 0.f;
+        float4 v_wsum = 0.f;
+        for (int y = 0; y < iy_size; y++) {
+            float dy  = iy_f - y;
+            int x     = max(ix_i - r, 0);
+            int end_x = min(ix_i + r, iw - 1);
+
+            float4 dx;
+            for (int i = 0; i < 4; i++) dx[i] = ix_f - x - i;
+
+            for (; x < end_x - 3; x += 4, dx -= 4) {
+                float4 w =
+                    factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy);
+                float4 src_vec = {
+                    start_src[y * iw + x + 0],
+                    start_src[y * iw + x + 1],
+                    start_src[y * iw + x + 2],
+                    start_src[y * iw + x + 3]};
+
+                v_sum += w * src_vec;
+                v_wsum += w;
+            }
+
+            for (; x <= end_x; x++) {
+                float dx = ix_f - x;
+                float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor * dy);
+
+                v_sum[0] += w * start_src[y * iw + x];
+                v_wsum[0] += w;
+            }
+        }
+
+        v_sum[0]  = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
+        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];
+
+        start_dst[get_local_id(1) * ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e2 = async_work_group_copy_2D2D(
+        dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow
+            + get_group_id(1) * get_local_size(1) * ow, // dst
+        local_dst, // src
+        get_local_size(1) * ow, // num_elements_per_line,
+        get_local_size(2), // num_lines,
+        0, // src_line_stride,
+        (get_global_size(1) - get_local_size(1)) * ow, // dst_line_stride,
+        0);
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/resample_nn.cl
+++ b/inference-engine/src/vpu/custom_kernels/resample_nn.cl
@ -1,173 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define USE_OPTIMIZED_ROUND
-
-#ifdef USE_OPTIMIZED_ROUND
-    #define ROUND(x)  ((int)((x) + 0.5f))
-#else
-    #define ROUND(x)  (int)(round(x))
-#endif
-
-inline int out_to_in(float ox, float f) {
-    return (int)((ox + 0.5f) * f);
-}
-
-#define USE_MANUAL_DMA
-
-#if defined (USE_MANUAL_DMA)
-
-void interpolationCHW_nn(__local half* psrc, __local half* pdst, int OW, int IW, int C, float rw, float rh)
-{
-    float alpha = rh / 2.0f - 0.5f;
-
-    for (int w = 0; w < OW/8; w++)
-    {
-        float fw0 = rw*(w*8+0) + alpha;
-        float fw1 = rw*(w*8+1) + alpha;
-        float fw2 = rw*(w*8+2) + alpha;
-        float fw3 = rw*(w*8+3) + alpha;
-
-        float fw4 = rw*(w*8+4) + alpha;
-        float fw5 = rw*(w*8+5) + alpha;
-        float fw6 = rw*(w*8+6) + alpha;
-        float fw7 = rw*(w*8+7) + alpha;
-
-        int iw0 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw0), IW-1);
-        int iw1 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw1), IW-1);
-        int iw2 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw2), IW-1);
-        int iw3 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw3), IW-1);
-
-        int iw4 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw4), IW-1);
-        int iw5 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw5), IW-1);
-        int iw6 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw6), IW-1);
-        int iw7 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw7), IW-1);
-
-        for (int c = 0; c < C; c++)
-        {
-            half8 val = {
-                *((__local half*)(psrc + c * IW + iw0)),
-                *((__local half*)(psrc + c * IW + iw1)),
-
-                *((__local half*)(psrc + c * IW + iw2)),
-                *((__local half*)(psrc + c * IW + iw3)),
-
-                *((__local half*)(psrc + c * IW + iw4)),
-                *((__local half*)(psrc + c * IW + iw5)),
-
-                *((__local half*)(psrc + c * IW + iw6)),
-                *((__local half*)(psrc + c * IW + iw7)),
-            };
-            *((__local half8*)(pdst + c * OW + w*8)) = val;
-        }
-    }
-
-    for (int w = OW/8*8; w < OW; w++)
-    {
-        float fw = rw*w + alpha;
-        int iw0 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw), IW-1);
-
-        for (int c = 0; c < C; c++)
-        {
-            *((__local half*)(pdst + c * OW + w)) = *((__local half*)(psrc + c * IW + iw0));
-        }
-    }
-}
-
-__kernel void __dma_preload_resample_nearest(__global const half* restrict src,
-                                             __global       half* restrict _0,
-                                             __local        half* restrict local_src,
-                                             __local        half* restrict _1,
-                                             int iw,
-                                             int ih,
-                                             float factor,
-                                             int ow,
-                                             int oh,
-                                             int channels)
-{
-    const int oy_first = get_group_id(1) * get_local_size(1);
-    const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
-    const int iy_first = out_to_in(oy_first, 1.0 / factor);
-    const int iy_last = out_to_in(oy_last, 1.0 /factor);
-    const int iy_size = iy_last - iy_first + 1;
-
-    WorkGroupDmaCreateStrideTransaction(
-        src + get_group_id(2)*channels*ih*iw + iy_first*iw, // src
-        local_src, // dst
-        iy_size * iw * sizeof(half), // src_width,
-        iy_size * iw * sizeof(half), // dst_width,
-        ih * iw * sizeof(half), // src_stride,
-        iy_size * iw * sizeof(half), // dst_stride,
-        channels * iy_size * iw * sizeof(half), // size
-        0);
-}
-
-__kernel void __dma_postwrite_resample_nearest(__global const half* restrict _0,
-                                               __global       half* restrict dst,
-                                               __local        half* restrict _1,
-                                               __local        half* restrict local_dst,
-                                               int iw,
-                                               int ih,
-                                               float factor,
-                                               int ow,
-                                               int oh,
-                                               int channels)
-{
-
-    WorkGroupDmaCreateStrideTransaction(
-        local_dst,  // src
-        dst + get_group_id(2)*channels*get_global_size(1)*ow + get_group_id(1)*get_local_size(1)*ow,  // dst
-        get_local_size(1) * ow * sizeof(half), // src_width,
-        get_local_size(1) * ow * sizeof(half), // dst_width,
-        get_local_size(1) * ow * sizeof(half), // src_stride,
-        get_global_size(1) * ow * sizeof(half), // dst_stride,
-        channels * get_local_size(1) * ow * sizeof(half), // size
-        0);
-}
-
-kernel void resample_nearest(__global const half* restrict src,
-                             __global       half* restrict dst,
-                             __local        half* restrict local_src,
-                             __local        half* restrict local_dst,
-                             int iw,
-                             int ih,
-                             float factor,
-                             int ow,
-                             int oh,
-                             int channels)
-{
-    interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor);
-}
-
-#else // defined (USE_MANUAL_DMA)
-
-kernel void resample_nearest(__global const half* restrict src,
-                             __global       half* restrict dst,
-                             __local        half* restrict local_src,
-                             __local        half* restrict local_dst,
-                             int iw,
-                             int ih,
-                             float factor,
-                             int ow,
-                             int oh,
-                             int channels)
-{
-    const float inv_factor = 1.0f / factor;
-    const int iy = out_to_in(get_global_id(1), inv_factor);
-
-    __global half* dst_data = dst + get_global_id(1)*ow;
-    __global half* src_data = src + iy*iw;
-
-    for (int ox = 0; ox < ow; ++ox)
-    {
-        const int ix = out_to_in(ox, inv_factor);
-        for (int c = 0; c < channels; c++) {
-            dst_data[c*oh*ow + ox] = src_data[c*ih*iw + ix];
-        }
-    }
-}
-
-#endif // defined (USE_MANUAL_DMA)
--- a/inference-engine/src/vpu/custom_kernels/resample_noAA.cl
+++ b/inference-engine/src/vpu/custom_kernels/resample_noAA.cl
@ -0,0 +1,112 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
+
+#define USE_OPTIMIZED_ROUND
+
+#ifdef USE_OPTIMIZED_ROUND
+#define ROUND(x) ((int)((x) + 0.5f))
+#else
+#define ROUND(x) (int)(round(x))
+#endif
+
+inline int out_to_in(float ox, float f) { return (int)((ox + 0.5f) * f); }
+
+void interpolationCHW_nn(__local half *psrc, __local half *pdst, int OW, int IW, int C, float rw, float rh)
+{
+    float alpha = rh / 2.0f - 0.5f;
+
+    for (int w = 0; w < OW / 8; w++) {
+        float fw0 = rw * (w * 8 + 0) + alpha;
+        float fw1 = rw * (w * 8 + 1) + alpha;
+        float fw2 = rw * (w * 8 + 2) + alpha;
+        float fw3 = rw * (w * 8 + 3) + alpha;
+
+        float fw4 = rw * (w * 8 + 4) + alpha;
+        float fw5 = rw * (w * 8 + 5) + alpha;
+        float fw6 = rw * (w * 8 + 6) + alpha;
+        float fw7 = rw * (w * 8 + 7) + alpha;
+
+        int iw0 = min((int)ROUND(fw0), IW - 1);
+        int iw1 = min((int)ROUND(fw1), IW - 1);
+        int iw2 = min((int)ROUND(fw2), IW - 1);
+        int iw3 = min((int)ROUND(fw3), IW - 1);
+
+        int iw4 = min((int)ROUND(fw4), IW - 1);
+        int iw5 = min((int)ROUND(fw5), IW - 1);
+        int iw6 = min((int)ROUND(fw6), IW - 1);
+        int iw7 = min((int)ROUND(fw7), IW - 1);
+
+        for (int c = 0; c < C; c++) {
+            half8 val = {
+                *((__local half *)(psrc + c * IW + iw0)),
+                *((__local half *)(psrc + c * IW + iw1)),
+                *((__local half *)(psrc + c * IW + iw2)),
+                *((__local half *)(psrc + c * IW + iw3)),
+
+                *((__local half *)(psrc + c * IW + iw4)),
+                *((__local half *)(psrc + c * IW + iw5)),
+                *((__local half *)(psrc + c * IW + iw6)),
+                *((__local half *)(psrc + c * IW + iw7)),
+            };
+            *((__local half8 *)(pdst + c * OW + w * 8)) = val;
+        }
+    }
+
+    for (int w = OW / 8 * 8; w < OW; w++) {
+        float fw = rw * w + alpha;
+        int iw0  = min((int)ROUND(fw), IW - 1);
+
+        for (int c = 0; c < C; c++) {
+            *((__local half *)(pdst + c * OW + w)) = *((__local half *)(psrc + c * IW + iw0));
+        }
+    }
+}
+
+kernel void resample_nearest(
+    __global const half *restrict src,
+    __global half *restrict dst,
+    int iw,
+    int ih,
+    float factor,
+    int ow,
+    int oh,
+    int channels)
+{
+    __local half local_src[14 * 1024];
+    __local half local_dst[14 * 1024];
+
+    const int oy_first = get_group_id(1) * get_local_size(1);
+    const int oy_last  = (get_group_id(1) + 1) * get_local_size(1) - 1;
+    const int iy_first = out_to_in(oy_first, 1.0 / factor);
+    const int iy_last  = out_to_in(oy_last, 1.0 / factor);
+
+    const int iy_size = iy_last - iy_first + 1;
+
+    event_t e1 = async_work_group_copy_2D2D(
+        local_src, // dst
+        src + get_group_id(2) * channels * ih * iw + iy_first * iw, // src
+        iy_size * iw, // num_elements_per_line,
+        channels, // num_lines,
+        ih * iw - iy_size * iw, // src_line_stride,
+        0, // dst_line_stride,
+        0);
+
+    wait_group_events(1, &e1);
+
+    interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor);
+
+    event_t e2 = async_work_group_copy_2D2D(
+        dst + get_group_id(2) * channels * get_global_size(1) * ow + get_group_id(1) * get_local_size(1) * ow, // dst
+        local_dst, // src
+        get_local_size(1) * ow, // size_t num_elements_per_line,
+        channels, // size_t num_lines,
+        0, // size_t src_line_stride,
+        get_global_size(1) * ow - get_local_size(1) * ow, // size_t dst_line_stride,
+        0);
+
+    wait_group_events(1, &e2);
+}
--- a/inference-engine/src/vpu/custom_kernels/resample_with_antialias.cl
+++ b/inference-engine/src/vpu/custom_kernels/resample_with_antialias.cl
@ -1,245 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define USE_OPTIMIZED_ROUND
-
-#ifdef USE_OPTIMIZED_ROUND
-    #define ROUND(x)  ((int)((x) + 0.5f))
-#else
-    #define ROUND(x)  (int)(round(x))
-#endif
-
-
-inline int out_to_in(float ox, float f) {
-#ifdef USE_OPTIMIZED_ROUND
-    return (int)((ox + 0.5f) / f);
-#else
-    return ROUND((ox + 0.5f) / f - 0.5f);
-#endif
-}
-
-static inline float triangleCoeff(float x)
-{
-    return 1.0f - fabs(x);
-}
-
-static inline float4 triangleCoeff4(float4 x)
-{
-    return 1.0f - fabs(x);
-}
-
-static inline half triangleCoeffHalf(half x)
-{
-    return 1.0h - fabs(x);
-}
-
-static inline half4 triangleCoeffHalf4(half4 x)
-{
-    return 1.0h - fabs(x);
-}
-
-static inline half8 triangleCoeffHalf8(half8 x)
-{
-    return 1.0h - fabs(x);
-}
-
-#define USE_MANUAL_DMA
-
-#if defined (USE_MANUAL_DMA)
-
-__kernel void __dma_preload_resample_with_antialias(__global const half* restrict src,
-                                             __global       half* restrict _0,
-                                             __local        half* restrict local_src,
-                                             __local        half* restrict _1,
-                                             int iw,
-                                             int ih,
-                                             float factor,
-                                             int ow,
-                                             int oh,
-                                             int channels)
-{
-    const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
-    const int oy_first = get_group_id(1) * get_local_size(1);
-    const int oy_last  = (get_group_id(1) + 1) * get_local_size(1) - 1;
-    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
-    const int iy_last  = min(out_to_in(oy_last, factor) + r, ih - 1);
-    const int iy_size = iy_last - iy_first + 1;
-
-    WorkGroupDmaCreateStrideTransaction(
-        src + get_group_id(2)*get_local_size(2)*ih*iw + iy_first*iw, // src
-        local_src, // dst
-        iy_size * iw * sizeof(half), // src_width,
-        iy_size * iw * sizeof(half), // dst_width,
-        ih * iw * sizeof(half), // src_stride,
-        iy_size * iw * sizeof(half), // dst_stride,
-        get_local_size(2) * iy_size * iw * sizeof(half), // size
-        0);
-}
-
-__kernel void __dma_postwrite_resample_with_antialias(__global const half* restrict _0,
-                                               __global       half* restrict dst,
-                                               __local        half* restrict _1,
-                                               __local        half* restrict dst_local,
-                                               int iw,
-                                               int ih,
-                                               float factor,
-                                               int ow,
-                                               int oh,
-                                               int channels)
-{
-    WorkGroupDmaCreateStrideTransaction(
-        dst_local,  // src
-        dst + get_group_id(2)*get_local_size(2)*get_global_size(1)*ow + get_group_id(1)*get_local_size(1)*ow,  // dst
-        get_local_size(1) * ow * sizeof(half), // src_width,
-        get_local_size(1) * ow * sizeof(half), // dst_width,
-        get_local_size(1) * ow * sizeof(half), // src_stride,
-        get_global_size(1) * ow * sizeof(half), // dst_stride,
-        get_local_size(2) * get_local_size(1) * ow * sizeof(half), // size
-        0);
-}
-
-__kernel void resample_with_antialias(const __global half* restrict src,
-                                      __global half* restrict dst,
-                                      __local half* restrict local_src,
-                                      __local half* restrict local_dst,
-                                      int iw,
-                                      int ih,
-                                      float factor,
-                                      int ow,
-                                      int oh,
-                                      int channels)
-{
-    const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
-    const int oy_first = get_group_id(1) * get_local_size(1);
-    const int oy_last  = (get_group_id(1) + 1) * get_local_size(1) - 1;
-    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
-    const int iy_last  = min(out_to_in(oy_last, factor) + r, ih - 1);
-    const int iy_size = iy_last - iy_first + 1;
-    const int oy = get_global_id(1);
-    const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;
-    const int iy = ROUND(iy_f);
-
-    __local half const *restrict start_src = local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2);
-    __local half       *restrict start_dst = local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);
-
-    for (int ox = 0; ox < ow; ox++)
-    {
-        const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
-        const int ix_i = ROUND(ix_f);
-
-        float4 v_sum = 0.f;
-        float4 v_wsum = 0.f;
-        for (int y = 0; y < iy_size; y++)
-        {
-            float dy = iy_f - y;
-            int x = max(ix_i - r, 0);
-            int end_x = min(ix_i + r, iw - 1);
-
-            float4 dx;
-            for (int i = 0; i < 4; i++)
-                dx[i] = ix_f - x - i;
-
-            for (; x < end_x - 3; x += 4, dx -= 4)
-            {
-                float4 w = factor*triangleCoeff4(factor*dx) * factor*triangleCoeff(factor*dy);
-                float4 src_vec = { start_src[y*iw + x + 0],
-                                   start_src[y*iw + x + 1],
-                                   start_src[y*iw + x + 2],
-                                   start_src[y*iw + x + 3] };
-
-                v_sum += w * src_vec;
-                v_wsum += w;
-            }
-
-            for (; x <= end_x; x++)
-            {
-                float dx = ix_f - x;
-                float w = factor*triangleCoeff(factor*dx) * factor*triangleCoeff(factor*dy);
-
-                v_sum[0] += w * start_src[y*iw + x];
-                v_wsum[0] += w;
-            }
-        }
-
-        v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
-        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];
-
-        start_dst[get_local_id(1)*ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
-    }
-}
-
-#else
-
-__kernel void resample_with_antialias(const __global half* restrict src,
-                                      __global half* restrict dst,
-                                      __local half* restrict _0,
-                                      __local half* restrict _1,
-                                      int iw,
-                                      int ih,
-                                      float factor,
-                                      int ow,
-                                      int oh,
-                                      int channels)
-{
-    int oy = get_global_id(1);
-    int c = get_global_id(2);
-
-    int r = (factor > 1.0f) ? 2 : ceil((1.0f)/factor);
-
-    const __global half* restrict start_src = src + iw * ih * c;
-    __global half* restrict start_dst = dst + ow * oh * c;
-
-    float iy_f = (oy + 0.5) / factor - 0.5f;
-    int iy_i = ROUND(iy_f);
-
-    for (int ox = 0; ox < ow; ox++)
-    {
-        float ix_f = (ox + 0.5) / factor - 0.5f;
-        int ix_i = ROUND(ix_f);
-
-        float4 v_sum = 0.f;
-        float4 v_wsum = 0.f;
-
-        for (int y = max(iy_i - r, 0); y <= min(iy_i + r, (int)ih - 1); y++)
-        {
-            float dy = iy_f - y;
-            int x = max(ix_i - r, 0);
-            int end_x = min(ix_i + r, (int)iw - 1);
-
-            float4 dx;
-            for (int i = 0; i < 4; i++)
-                dx[i] = ix_f - x - i;
-
-            for (; x <= end_x - 3; x += 4, dx -= 4)
-            {
-                float4 w = factor*triangleCoeff4(factor*dx) * factor*triangleCoeff(factor*dy);
-                float4 src_vec = { start_src[y*iw + x + 0],
-                                   start_src[y*iw + x + 1],
-                                   start_src[y*iw + x + 2],
-                                   start_src[y*iw + x + 3] };
-
-                v_sum += w * src_vec;
-                v_wsum += w;
-            }
-
-            for (; x <= end_x; x++)
-            {
-                float dx = ix_f - x;
-                float w = factor*triangleCoeff(factor*dx) * factor*triangleCoeff(factor*dy);
-
-                v_sum[0] += w * start_src[y*iw + x];
-                v_wsum[0] += w;
-            }
-        }
-
-        v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
-        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];
-
-        start_dst[oy*ow + ox] = (!v_wsum[0]) ? (half)0.0f : (half)(v_sum[0] / v_wsum[0]);
-    }
-}
-
-#endif
--- a/inference-engine/src/vpu/custom_kernels/shuffle_channels.cl
+++ b/inference-engine/src/vpu/custom_kernels/shuffle_channels.cl
@ -4,12 +4,13 @@

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-__kernel void ShuffleChannel(__global const half* restrict src_data,
-                             __global       half* restrict dst_data,
-                             int C,
-                             int H,
-                             int W,
-                             int G)
+__kernel void ShuffleChannel(
+    __global const half *restrict src_data,
+    __global half *restrict dst_data,
+    int C,
+    int H,
+    int W,
+    int G)
 {
    int c = get_global_id(0);
    if (c >= C) return;
@ -18,16 +19,15 @@ __kernel void ShuffleChannel(__global const half* restrict src_data,
    int cy = c % G;
    int cx = c / G;

-    __global const half8* src_line = ((__global const half8*)(src_data + cy*CX*H*W + cx*H*W));
-    __global       half8* dst_line = ((__global       half8*)(dst_data + cx*CY*H*W + cy*H*W));
+    __global const half8 *src_line =
+        ((__global const half8 *)(src_data + cy * CX * H * W + cx * H * W));
+    __global half8 *dst_line = ((__global half8 *)(dst_data + cx * CY * H * W + cy * H * W));

-    for (int i = 0; i < W*H/8; i++)
-    {
+    for (int i = 0; i < W * H / 8; i++) {
        dst_line[i] = src_line[i];
    }

-    for (int i = W*H/8*8; i < W*H; i++)
-    {
-        dst_data[cx*CY*H*W + cy*H*W + i] = src_data[cy*CX*H*W + cx*H*W + i];
+    for (int i = W * H / 8 * 8; i < W * H; i++) {
+        dst_data[cx * CY * H * W + cy * H * W + i] = src_data[cy * CX * H * W + cx * H * W + i];
    }
 }
--- a/inference-engine/src/vpu/custom_kernels/st.cl
+++ b/inference-engine/src/vpu/custom_kernels/st.cl
@ -3,51 +3,29 @@
 //

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

 #define MAX_WIDTH 512
-#define MIN(a, b) ((a) < (b)) ? (a) : (b);

-__kernel void __dma_postwrite_ocl_st(__global half const *const restrict src_data,
-                                     __global half const *const restrict theta,
-                                     __global half       *const restrict dst_data,
-                                     int C,
-                                     int W,
-                                     __local half   const *const restrict local_dst)
-{
-    const int x0 = get_global_id(0) * MAX_WIDTH;
-    const int x1 = MIN(x0 + MAX_WIDTH, W);
-    const int length = x1 - x0;
-
-    WorkGroupDmaCreate3DTransaction(
-        local_dst, // src
-        dst_data + get_global_id(1) * W + x0, // dst
-        length * sizeof(half), // src width
-        length * sizeof(half), // dst width
-        length * sizeof(half), // src stride
-        W * sizeof(half), // dst stride
-        C, // num planes
-        get_local_size(1) * length * sizeof(half), // src plane stride
-        get_global_size(1) * W * sizeof(half), // dst plane stride
-        get_local_size(1) * length * sizeof(half), // plane size
-        0);
-}
-
-__attribute__((noinline))
-void calcInd(__global half const *const restrict theta,
-                      half       *const restrict weight,
-                      int        *const restrict ind,
-                      int y, int H, int x0, int length, int step, int W)
+__attribute__((noinline)) void calcInd(
+    __global const half *restrict theta,
+    __local half *restrict weight,
+    __local int *restrict ind,
+    int y,
+    int H,
+    int x0,
+    int length,
+    int step,
+    int W)
 {
    float a = (float)y * 1.0f / H * 2 - 1;

    int x = 0;

-    float8 va = (float8) {a, a, a, a, a, a, a, a};
-    float8 vxy = (float8) {x0 + 0, x0 + 1, x0 + 2, x0 + 3,
-                           x0 + 4, x0 + 5, x0 + 6, x0 + 7};
+    float8 va  = (float8){a, a, a, a, a, a, a, a};
+    float8 vxy = (float8){x0 + 0, x0 + 1, x0 + 2, x0 + 3, x0 + 4, x0 + 5, x0 + 6, x0 + 7};

-    for (; x <= length - 8; x += 8, vxy += 8)
-    {
+    for (; x <= length - 8; x += 8, vxy += 8) {
        float8 va1 = vxy * 1.0f / W * 2 - 1.f;

        float8 vx = (va * theta[0] + va1 * theta[1] + theta[2] + 1.f) / 2.f * H;
@ -61,21 +39,27 @@ void calcInd(__global half const *const restrict theta,
        float8 bx = 1.f - ax;
        float8 by = 1.f - ay;

-        union {int8 d; uint8 i; } check_x;
+        union {
+            int8 d;
+            uint8 i;
+        } check_x;

        check_x.d = ix;
-        int8 b01 = check_x.i < (uint8)H;
+        int8 b01  = check_x.i < (uint8)H;

        check_x.d = ix + 1;
-        int8 b45 = check_x.i < (uint8)H;
+        int8 b45  = check_x.i < (uint8)H;

-        union {int8 d; uint8 i; } check_y;
+        union {
+            int8 d;
+            uint8 i;
+        } check_y;

        check_y.d = iy;
-        int8 b23 = check_y.i < (uint8)W;
+        int8 b23  = check_y.i < (uint8)W;

        check_y.d = iy + 1;
-        int8 b67 = check_y.i < (uint8)W;
+        int8 b67  = check_y.i < (uint8)W;

        int8 b0123 = b01 & b23;
        int8 b0167 = b01 & b67;
@ -87,33 +71,48 @@ void calcInd(__global half const *const restrict theta,
        int8 TR_id = ((ix + 0) * W + (iy + 1)) * (b0167 & 1);
        int8 BR_id = ((ix + 1) * W + (iy + 1)) * (b4567 & 1);

-        union {float8 f; int8 i;} w0; w0.f = bx * by;
-        union {float8 f; int8 i;} w1; w1.f = ax * by;
-        union {float8 f; int8 i;} w2; w2.f = bx * ay;
-        union {float8 f; int8 i;} w3; w3.f = ax * ay;
+        union {
+            float8 f;
+            int8 i;
+        } w0;
+        w0.f = bx * by;
+        union {
+            float8 f;
+            int8 i;
+        } w1;
+        w1.f = ax * by;
+        union {
+            float8 f;
+            int8 i;
+        } w2;
+        w2.f = bx * ay;
+        union {
+            float8 f;
+            int8 i;
+        } w3;
+        w3.f = ax * ay;

        w0.i = w0.i & b0123;
        w1.i = w1.i & b4523;
        w2.i = w2.i & b0167;
        w3.i = w3.i & b4567;

-        *((half8*)(weight + x + 0*step)) = convert_half8(w0.f);
-        *((half8*)(weight + x + 1*step)) = convert_half8(w1.f);
-        *((half8*)(weight + x + 2*step)) = convert_half8(w2.f);
-        *((half8*)(weight + x + 3*step)) = convert_half8(w3.f);
+        *((__local half8 *)(weight + x + 0 * step)) = convert_half8(w0.f);
+        *((__local half8 *)(weight + x + 1 * step)) = convert_half8(w1.f);
+        *((__local half8 *)(weight + x + 2 * step)) = convert_half8(w2.f);
+        *((__local half8 *)(weight + x + 3 * step)) = convert_half8(w3.f);

-        *((int8*)(ind + x + 0*step)) = TL_id;
-        *((int8*)(ind + x + 1*step)) = BL_id;
-        *((int8*)(ind + x + 2*step)) = TR_id;
-        *((int8*)(ind + x + 3*step)) = BR_id;
+        *((__local int8 *)(ind + x + 0 * step)) = TL_id;
+        *((__local int8 *)(ind + x + 1 * step)) = BL_id;
+        *((__local int8 *)(ind + x + 2 * step)) = TR_id;
+        *((__local int8 *)(ind + x + 3 * step)) = BR_id;
    }

-    for (; x < length; x++)
-    {
+    for (; x < length; x++) {
        float a1 = (float)(x0 + x) * 1.0f / W * 2 - 1;

-        float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1)/2 * H;
-        float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1)/2 * W;
+        float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1) / 2 * H;
+        float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1) / 2 * W;

        const int ix = (int)(fx) - (fx < 0);
        const int iy = (int)(fy) - (fy < 0);
@ -123,15 +122,15 @@ void calcInd(__global half const *const restrict theta,
        float bx = 1 - ax;
        float by = 1 - ay;

-        int b0 = ix >=  0;
+        int b0 = ix >= 0;
        int b4 = ix >= -1;
-        int b1 = ix  <  H;
-        int b5 = ix  <  H-1;
+        int b1 = ix < H;
+        int b5 = ix < H - 1;

-        int b2 = iy >=  0;
+        int b2 = iy >= 0;
        int b6 = iy >= -1;
-        int b3 = iy  <  W;
-        int b7 = iy  <  W-1;
+        int b3 = iy < W;
+        int b7 = iy < W - 1;

        int b01 = b0 & b1;
        int b23 = b2 & b3;
@ -148,69 +147,79 @@ void calcInd(__global half const *const restrict theta,
        int TR_id = ((ix + 0) * W + (iy + 1)) * b0167;
        int BR_id = ((ix + 1) * W + (iy + 1)) * b4567;

-        half w0 = bx*by*b0123;
-        half w1 = ax*by*b4523;
-        half w2 = bx*ay*b0167;
-        half w3 = ax*ay*b4567;
+        half w0 = bx * by * b0123;
+        half w1 = ax * by * b4523;
+        half w2 = bx * ay * b0167;
+        half w3 = ax * ay * b4567;

-        weight[x + 0*step] = w0;
-        weight[x + 1*step] = w1;
-        weight[x + 2*step] = w2;
-        weight[x + 3*step] = w3;
+        weight[x + 0 * step] = w0;
+        weight[x + 1 * step] = w1;
+        weight[x + 2 * step] = w2;
+        weight[x + 3 * step] = w3;

-        ind[x + 0*step] = TL_id;
-        ind[x + 1*step] = BL_id;
-        ind[x + 2*step] = TR_id;
-        ind[x + 3*step] = BR_id;
+        ind[x + 0 * step] = TL_id;
+        ind[x + 1 * step] = BL_id;
+        ind[x + 2 * step] = TR_id;
+        ind[x + 3 * step] = BR_id;
    }
 }

-__attribute__((noinline))
-void apply(__global half const *const restrict src,
-                    half const *const restrict weight,
-                    int  const *const restrict ind,
-            __local half       *const restrict dst,
-                    int length,
-                    int step)
+__attribute__((noinline)) void apply(
+    __global half const *restrict src,
+    __local half const *restrict weight,
+    __local int const *restrict ind,
+    __local half *restrict dst,
+    int src_stride,
+    int step)
 {
    int x = 0;
-    for(; x <= length - 8; x += 8)
-    {
-        int8 TL_id = *((int8*)(ind + x + 0*step));
-        int8 BL_id = *((int8*)(ind + x + 1*step));
-        int8 TR_id = *((int8*)(ind + x + 2*step));
-        int8 BR_id = *((int8*)(ind + x + 3*step));
+    for (; x <= src_stride - 8; x += 8) {
+        int8 TL_id = *((__local int8 *)(ind + x + 0 * step));
+        int8 BL_id = *((__local int8 *)(ind + x + 1 * step));
+        int8 TR_id = *((__local int8 *)(ind + x + 2 * step));
+        int8 BR_id = *((__local int8 *)(ind + x + 3 * step));

-        half8 w00 = *((half8*)(weight + x + 0*step));
-        half8 w01 = *((half8*)(weight + x + 1*step));
-        half8 w02 = *((half8*)(weight + x + 2*step));
-        half8 w03 = *((half8*)(weight + x + 3*step));
+        half8 w00 = *((__local half8 *)(weight + x + 0 * step));
+        half8 w01 = *((__local half8 *)(weight + x + 1 * step));
+        half8 w02 = *((__local half8 *)(weight + x + 2 * step));
+        half8 w03 = *((__local half8 *)(weight + x + 3 * step));

-        half8 TL = (half8){src[TL_id[0]], src[TL_id[1]], src[TL_id[2]], src[TL_id[3]],
-                           src[TL_id[4]], src[TL_id[5]], src[TL_id[6]], src[TL_id[7]]};
-        half8 TR = (half8){src[TR_id[0]], src[TR_id[1]], src[TR_id[2]], src[TR_id[3]],
-                           src[TR_id[4]], src[TR_id[5]], src[TR_id[6]], src[TR_id[7]]};
-        half8 BL = (half8){src[BL_id[0]], src[BL_id[1]], src[BL_id[2]], src[BL_id[3]],
-                           src[BL_id[4]], src[BL_id[5]], src[BL_id[6]], src[BL_id[7]]};
-        half8 BR = (half8){src[BR_id[0]], src[BR_id[1]], src[BR_id[2]], src[BR_id[3]],
-                           src[BR_id[4]], src[BR_id[5]], src[BR_id[6]], src[BR_id[7]]};
+        half8 TL = (half8){
+            src[TL_id[0]], src[TL_id[1]],
+            src[TL_id[2]], src[TL_id[3]],
+            src[TL_id[4]], src[TL_id[5]],
+            src[TL_id[6]], src[TL_id[7]]};
+        half8 TR = (half8){
+            src[TR_id[0]], src[TR_id[1]],
+            src[TR_id[2]], src[TR_id[3]],
+            src[TR_id[4]], src[TR_id[5]],
+            src[TR_id[6]], src[TR_id[7]]};
+        half8 BL = (half8){
+            src[BL_id[0]], src[BL_id[1]],
+            src[BL_id[2]], src[BL_id[3]],
+            src[BL_id[4]], src[BL_id[5]],
+            src[BL_id[6]], src[BL_id[7]]};
+        half8 BR = (half8){
+            src[BR_id[0]], src[BR_id[1]],
+            src[BR_id[2]], src[BR_id[3]],
+            src[BR_id[4]], src[BR_id[5]],
+            src[BR_id[6]], src[BR_id[7]]};

-        half8 res = w00 * TL +  w01 * BL + w02 * TR + w03 * BR;
+        half8 res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;

-        *((__local half8*)(dst + x)) = res;
+        *((__local half8 *)(dst + x)) = res;
    }

-    for (; x < length; x++)
-    {
-        int TL_id = ind[x + 0*step];
-        int BL_id = ind[x + 1*step];
-        int TR_id = ind[x + 2*step];
-        int BR_id = ind[x + 3*step];
+    for (; x < src_stride; x++) {
+        int TL_id = ind[x + 0 * step];
+        int BL_id = ind[x + 1 * step];
+        int TR_id = ind[x + 2 * step];
+        int BR_id = ind[x + 3 * step];

-        half w00 = weight[x + 0*step];
-        half w01 = weight[x + 1*step];
-        half w02 = weight[x + 2*step];
-        half w03 = weight[x + 3*step];
+        half w00 = weight[x + 0 * step];
+        half w01 = weight[x + 1 * step];
+        half w02 = weight[x + 2 * step];
+        half w03 = weight[x + 3 * step];

        half TL = src[TL_id];
        half TR = src[TR_id];
@ -218,36 +227,52 @@ void apply(__global half const *const restrict src,
        half BR = src[BR_id];

        half res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;
+
        dst[x] = res;
    }
 }

-__kernel void ocl_st(__global half const *const restrict src_data,
-                     __global half const *const restrict theta,
-                     __global half const *const restrict dst_data,
-                     int C,
-                     int W,
-                     __local half        *const restrict local_dst)
+__kernel void ocl_st(
+    __global half const *const restrict src_data,
+    __global half const *const restrict theta,
+    __global half *const restrict dst_data,
+    int C,
+    int W)
 {
+    __local int ind[4 * MAX_WIDTH] __attribute__((aligned(16)));
+    __local half weight[4 * MAX_WIDTH] __attribute__((aligned(16)));
+    __local half local_dst[4 * 1024];
+
    int w = get_group_id(0);

    int y = get_global_id(1);
    int H = get_global_size(1);

-    __private int ind[4][MAX_WIDTH] __attribute__((aligned(16)));
-    __private half weight[4][MAX_WIDTH] __attribute__((aligned(16)));
+    const int x0         = w * MAX_WIDTH;
+    const int x1         = min(x0 + MAX_WIDTH, W);
+    const int src_stride = x1 - x0;

-    const int x0 = w * MAX_WIDTH;
-    const int x1 = MIN(x0 + MAX_WIDTH, W);
-    const int length = x1 - x0;
+    calcInd(theta, weight, ind, y, H, x0, src_stride, MAX_WIDTH, W);

-    calcInd(theta, weight, ind, y, H, x0, length, MAX_WIDTH, W);
+    for (int c = 0; c < C; c++) {
+        __global half const *restrict src = src_data + c * H * W;
+        __local half *restrict dst = local_dst + c * get_local_size(1) * src_stride + get_local_id(1) * src_stride;

-    for (int c = 0; c < C; c++)
-    {
-        __global half const *const restrict src = src_data + c*H*W;
-        __local  half       *const restrict dst = local_dst + c*get_local_size(1)*length + get_local_id(1)*length;
-
-        apply(src, weight, ind, dst, length, MAX_WIDTH);
+        apply(src, weight, ind, dst, src_stride, MAX_WIDTH);
    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    event_t e = async_work_group_copy_3D3D(
+        dst_data + get_group_id(1) * get_local_size(1) * W + x0, // dst
+        local_dst, // src
+        src_stride, // num_elements_per_line
+        get_local_size(1), // num_lines
+        0, // src_line_stride
+        W - src_stride, // dst_line_stride
+        C, // num planes
+        0, // src plane stride
+        W * (get_global_size(1) - get_local_size(1)), // dst plane stride
+        0);
+    wait_group_events(1, &e);
 }
--- a/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadata.h
+++ b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadata.h
@ -0,0 +1,188 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef SHAVE_METADATA_H_INCLUDED
+#define SHAVE_METADATA_H_INCLUDED
+
+#include <cstdint>
+
+
+enum {
+  md_invalid_index = ~0u,
+};
+
+enum md_version_t {
+  md_version_1_0 = 0x00010000,  // version 1.0
+  md_version_1_1 = 0x00010001,  // version 1.1
+  md_version_1_2 = 0x00010002,  // version 1.2
+  md_version_latest = md_version_1_2
+};
+
+struct md_header_t {
+  uint32_t version;             // 0xFFFF0000 = Major   0x0000FFFF = Minor
+
+  // md_kernel_descriptor_t array info
+  uint32_t kernel_count;        // number of kernels in the .metadata
+  uint32_t kernel_first;        // absolute byte offset to first
+                                // md_kernel_descriptor_t from start of .metadata
+
+  // md_kernel_argument_t array info
+  uint32_t arg_count;           // number of arguments in the .metadata
+  uint32_t arg_first;           // absolute byte offset to first
+                                // md_kernel_argument_t from start of .metadata
+
+  // md_kernel_sipp_info_t array info
+  uint32_t sipp_info_count;     // number of sipp dma infos in .metadata
+  uint32_t sipp_info_first;     // absolute byte offset to first
+                                // md_kernel_sipp_info_t from start of .metadata
+
+  // md_expr_t array info
+  uint32_t expr_count;          // number of expressions in .metadata
+  uint32_t expr_first;          // absolute byte offset to first
+                                // kernel_expr_t from start of .metadata
+
+  // md_expr_node_t array info
+  uint32_t expr_node_count;     // number of expression nodes in .metadata
+  uint32_t expr_node_first;     // absolute byte offset to first md_expr_node_t
+                                // from start of .metadata
+
+  // function table
+  uint32_t func_count;          // number of functions in the function table
+  uint32_t func_first;          // absolute byte offset to the first md_function_t
+};
+
+struct md_function_t {
+  uint32_t load_address;        // runtime address of a kernel function
+};
+
+struct md_kernel_variant_t {
+  uint32_t name;                // offset into the string table of the kernel name
+  uint32_t factor;              // vector width / unroll factor
+  uint32_t func;                // index into the kernel function table
+};
+
+enum md_kernel_variant_type_t {
+  md_variant_scalar = 0,           // basic scalar kernel
+  md_variant_vectorized,           // kernel has been vectorized
+  md_variant_unrolled,             // kernel has been loop unrolled
+  md_variant_sipp_dma,             // sipp dma kernel
+  md_variant_sipp_dma_vectorized,  // vectorized sipp dma kernel
+  md_variant_dma_preload,          // kernel preload function
+  md_variant_dma_postwrite,        // kernel postwrite function
+  md_variant_dma_fallback,         // kernel fallback function
+  md_VARIANT_COUNT
+};
+
+constexpr int kVariantCount = md_VARIANT_COUNT;
+
+enum md_kernel_flags_t {
+  md_kernel_flags_ddr_write         = 1u,   // kernel writes to DDR memory
+  md_kernel_flags_ddr_read          = 2u,   // kernel reads from DDR memory
+  md_kernel_flags_generated_prepost = 4u,   // kernel has an autogenerated prepost
+};
+
+struct md_kernel_descriptor_t {
+  uint32_t flags;               // combination of md_kernel_flags_t
+
+  uint32_t arg_count;           // number of arguments for this kernel
+  uint32_t arg_index;           // index of first kernel_argument_t
+
+  uint32_t sipp_dma_in_count;   // number of SIPP dma input arguments (or 0 if no SIPP dma)
+  uint32_t sipp_dma_out_count;  // number of SIPP dma output arguments (or 0 if no SIPP dma)
+  uint32_t sipp_info_index;     // index into the kernel_sipp_info_t list
+
+  uint32_t name;                // metadata string table offset for kernel name
+
+  uint32_t stack_size_wg;       // estimate of stack usage per work group (fixed)
+  uint32_t stack_size_wi;       // estimate of stack usage per work item
+
+                                // kernel variant list
+  md_kernel_variant_t variant[kVariantCount];
+};
+
+enum md_arg_addr_space_t {
+  md_addr_space_private = 0,
+  md_addr_space_global,         // global address space (ddr)
+  md_addr_space_constant,       //
+  md_addr_space_local,          // local address space (cmx)
+
+  md_addr_space_undef,          // none of the others
+};
+
+enum md_arg_flags_t {
+  md_arg_flags_dma_input         = 1u,   // local argument is being read from
+  md_arg_flags_dma_output        = 2u,   // local argument is being written to
+  md_arg_flags_dma_double_buffer = 4u,   // local argument should be double buffered
+  md_arg_flags_generated_prepost = 8u,   // preload and post write are auto generated
+};
+
+struct md_kernel_argument_t {
+  uint32_t flags;                  // bitfield of md_arg_flags_t
+  uint32_t name;                   // argument name
+  uint32_t array_size_expr;        // index to a `kernel_expr_t` type for evaluating total number of element
+  uint32_t size_elm;               // size in bytes of the underlying element
+  md_arg_addr_space_t addr_space;  // the arguments address space
+  uint32_t alignment;              // alignment require in bytes
+  uint32_t arg_pack_offset;        // offset into the argument pack
+};
+
+struct md_kernel_sipp_info_t {
+  uint32_t num_dims;            // number of dimensions of the dma
+  uint32_t span_x;
+  uint32_t span_y;
+
+  // below are all indexes to a 'kernel_expr_t'
+  uint32_t elm_size;            // size in bytes of the element
+  uint32_t stride_y;            // stride in elm_size in y axis
+  uint32_t stride_z;            //                       z
+  uint32_t base;                // address of the base of the buffer
+  uint32_t size_x;              // size in elements for x dim
+  uint32_t size_y;              //                      y
+  uint32_t size_z;              //                      z
+  uint32_t max_x;               // max work item index in x dim
+  uint32_t max_y;               //                        y
+  uint32_t max_z;               //                        z
+};
+
+enum md_expr_node_type_t {
+  md_type_global_size = 0,      // global work size
+  md_type_local_size,           // local work size
+  md_type_param,                // kernel parameter
+  md_type_immediate,            // uint32_t immediate value
+
+  md_type_op_umul,              // unsigned multiply
+  md_type_op_udiv,              // unsigned divide
+
+  md_type_op_add,               // add
+  md_type_op_sub,               // subtract
+
+  md_type_op_min,               // signed min
+  md_type_op_max,               // signed max
+  md_type_op_umin,              // unsigned min
+  md_type_op_umax,              // unsigned max
+
+  md_type_op_and,               // bitwise and
+  md_type_op_or,                // bitwise or
+  md_type_op_xor,               // bitwise xor
+
+  md_type_op_shl,               // left shift
+  md_type_op_lshr,              // right shift
+
+  // more operators as needed
+  // ...
+};
+
+struct md_expr_node_t {
+  md_expr_node_type_t type;     // type of this expression node
+  uint32_t value;               // immediate or operand
+};
+
+struct md_expr_t {
+  uint32_t node_count;          // number of md_expr_node_t's that make up this
+                                // expression
+  uint32_t node_first;          // index of the first md_expr_node_t that
+                                // is part of this expression
+};
+
+#endif  // SHAVE_METADATA_H_INCLUDED
--- a/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadataParser.h
+++ b/inference-engine/src/vpu/graph_transformer/include/vpu/frontend/ShaveElfMetadataParser.h
@ -0,0 +1,225 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef SHAVE_METADATA_PARSER_H_INCLUDED
+#define SHAVE_METADATA_PARSER_H_INCLUDED
+
+#include <string>
+#include <vector>
+#include <cassert>
+#include <cstring>
+
+#include "ShaveElfMetadata.h"
+
+
+struct md_parser_t {
+  md_parser_t(const uint8_t *data, size_t data_size,
+              const char *strtab,
+              size_t strtab_size)
+      : hdr(reinterpret_cast<const md_header_t *>(data)),
+        kernel_descriptor(reinterpret_cast<const md_kernel_descriptor_t *>(
+            data + hdr->kernel_first)),
+        kernel_argument(reinterpret_cast<const md_kernel_argument_t *>(
+            data + hdr->arg_first)),
+        kernel_sipp_info(reinterpret_cast<const md_kernel_sipp_info_t *>(
+            data + hdr->sipp_info_first)),
+        expr_node(reinterpret_cast<const md_expr_node_t *>(
+            data + hdr->expr_node_first)),
+        expr(reinterpret_cast<const md_expr_t *>(data + hdr->expr_first)),
+        func(reinterpret_cast<const md_function_t *>(data + hdr->func_first)),
+        strtab(strtab), strtab_size(strtab_size) {
+    (void)data_size;
+    (void)strtab_size;
+    assert(hdr->version == md_version_latest);
+  }
+
+  // Return the metadata version
+  //
+  md_version_t get_version() const {
+    return static_cast<md_version_t>(hdr->version);
+  }
+
+  // Get a kernel by name
+  //
+  const md_kernel_descriptor_t *get_kernel(const std::string &name) const {
+    for (uint32_t i=0; i < hdr->kernel_count; ++i) {
+      const md_kernel_descriptor_t *d = get_kernel(i);
+      const char *n = get_name(d);
+      if (name == n) {
+        return d;
+      }
+    }
+    return nullptr;
+  }
+
+  // Get a kernel id by name
+  //
+  int get_kernel_id(const std::string& name) const {
+      for (uint32_t i = 0; i < hdr->kernel_count; ++i) {
+          const md_kernel_descriptor_t* d = get_kernel(i);
+          const char* n = get_name(d);
+          if (name == n) {
+              return i;
+          }
+      }
+      return -1;
+  }
+
+  // Return true if a kernel has a specific variant
+  //
+  bool kernel_has_variant(const md_kernel_descriptor_t *kernel,
+                          md_kernel_variant_type_t variant) const {
+    const auto &v = kernel->variant[ variant ];
+    return v.name != md_invalid_index &&
+           v.func != md_invalid_index;
+  }
+
+  // return the load address of a kernel variant
+  //
+  uint32_t get_kernel_load_addr(const md_kernel_descriptor_t *kernel, const md_kernel_variant_type_t variant) {
+    if (!kernel_has_variant(kernel, variant)) {
+      return 0;
+    }
+    const auto &v = kernel->variant[ variant ];
+    const md_function_t &f = func[v.func];
+    return f.load_address;
+  }
+
+  // Get a rough stack size estimate for a kernel variant
+  //
+  uint32_t get_kernel_stack_estimate(const md_kernel_descriptor_t *kernel,
+                                     md_kernel_variant_type_t variant,
+                                     const uint32_t local_size[3]) const {
+    const uint32_t local_area = local_size[0] * local_size[1] * local_size[2];
+    const uint32_t per_wi = local_area * kernel->stack_size_wi;
+    const uint32_t per_wg = kernel->stack_size_wg;
+    const uint32_t factor = kernel->variant[variant].factor;
+    switch (variant) {
+    case md_variant_vectorized:
+    case md_variant_unrolled:     return per_wg + per_wi * factor;
+    case md_variant_scalar:
+    default:                      return per_wg + per_wi;
+    }
+  }
+
+  // Return the number of local arguments a kernel has
+  //
+  uint32_t get_num_local_args(const md_kernel_descriptor_t *kernel) const {
+    uint32_t out = 0;
+    for (uint32_t i = 0; i < kernel->arg_count; ++i) {
+      const md_kernel_argument_t *arg = get_argument(kernel->arg_index + i);
+      out += arg->addr_space == md_addr_space_local;
+    }
+    return out;
+  }
+
+  // Get the number of distinct kernels in this file
+  //
+  uint32_t get_kernel_count() const {
+    return hdr->kernel_count;
+  }
+
+  // Get a function by index
+  //
+  const md_function_t *get_func_ptr(uint32_t index) const {
+    assert(index != md_invalid_index && index < hdr->func_count);
+    return func + index;
+  }
+
+  // Get a kernel by load address
+  //
+  const md_kernel_descriptor_t *get_kernel_by_addr(uint32_t addr) const {
+    for (uint32_t i = 0; i < hdr->kernel_count; ++i) {
+      const md_kernel_descriptor_t *desc = get_kernel(i);
+      for (uint32_t j = 0; j < md_VARIANT_COUNT; ++j) {
+        const uint32_t index = desc->variant[j].func;
+        if (index == md_invalid_index) {
+          continue;
+        }
+        const md_function_t *ptr = get_func_ptr(index);
+        if (ptr->load_address == addr) {
+          return desc;
+        }
+      }
+    }
+    return nullptr;
+  }
+
+  // Get a kernel by index
+  //
+  const md_kernel_descriptor_t *get_kernel(uint32_t index) const {
+    assert(index < hdr->kernel_count);
+    return kernel_descriptor + index;
+  }
+
+  // Get an argument by index
+  //
+  const md_kernel_argument_t *get_argument(uint32_t index) const {
+    assert(index < hdr->arg_count);
+    return kernel_argument + index;
+  }
+
+  // Get SIPP info by index
+  //
+  const md_kernel_sipp_info_t *get_sipp_info(uint32_t index) const {
+    assert(index < hdr->sipp_info_count);
+    return kernel_sipp_info + index;
+  }
+
+  // Get an expression node by index
+  //
+  const md_expr_node_t *get_expr_node(uint32_t index) const {
+    assert(index < hdr->expr_node_count);
+    return expr_node + index;
+  }
+
+  // Get an expression by index
+  //
+  const md_expr_t *get_expr(uint32_t index) const {
+    assert(index < hdr->expr_count);
+    return expr + index;
+  }
+
+  // Get a kernel argument for a specific kernel by position
+  //
+  const md_kernel_argument_t *get_argument(const md_kernel_descriptor_t *kernel, uint32_t index) const {
+    assert(index < kernel->arg_count);
+    return get_argument(kernel->arg_index + index);
+  }
+
+  // Return the name of a kernel
+  //
+  const char *get_name(const md_kernel_descriptor_t *kernel) const {
+    return strtab + kernel->name;
+  }
+
+  // Return the name of an argument
+  //
+  const char *get_name(const md_kernel_argument_t *arg) const {
+    return strtab + arg->name;
+  }
+
+  // Evaluate an arbitary expression
+  //
+  uint32_t evaluate_expr(const md_expr_t *expression,
+                         const uint32_t local_size[3],
+                         const uint32_t global_size[3],
+                         const uint32_t *param,
+                         uint32_t param_count) const;
+
+protected:
+  // structure parsers
+  const md_header_t *hdr;
+  const md_kernel_descriptor_t *kernel_descriptor;
+  const md_kernel_argument_t *kernel_argument;
+  const md_kernel_sipp_info_t *kernel_sipp_info;
+  const md_expr_node_t *expr_node;
+  const md_expr_t *expr;
+  const md_function_t *func;
+  // string table
+  const char *strtab;
+  const size_t strtab_size;
+};
+
+#endif  // SHAVE_METADATA_PARSER_H_INCLUDED
--- a/inference-engine/src/vpu/graph_transformer/src/frontend/ShaveElfMetadataParser.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/frontend/ShaveElfMetadataParser.cpp
@ -0,0 +1,93 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/frontend/ShaveElfMetadataParser.h"
+#include <algorithm>
+
+namespace {
+
+// two operand operator evaluation
+uint32_t md_eval_expression_type_op_2(
+    const md_expr_node_type_t type,
+    const uint32_t lhs,
+    const uint32_t rhs) {
+  switch (type) {
+  case md_type_op_umul:    return lhs * rhs;
+  case md_type_op_udiv:    return lhs / rhs;
+  case md_type_op_add:     return (int32_t)lhs + (int32_t)rhs;
+  case md_type_op_sub:     return (int32_t)lhs - (int32_t)rhs;
+  case md_type_op_min:     return std::min((int32_t)lhs, (int32_t)rhs);
+  case md_type_op_max:     return std::max((int32_t)lhs, (int32_t)rhs);
+  case md_type_op_umin:    return std::min(lhs, rhs);
+  case md_type_op_umax:    return std::max(lhs, rhs);
+  case md_type_op_and:     return lhs & rhs;
+  case md_type_op_or:      return lhs | rhs;
+  case md_type_op_xor:     return lhs ^ rhs;
+  case md_type_op_shl:     return lhs << rhs;
+  case md_type_op_lshr:    return lhs >> rhs;
+  default:
+    assert(!"unknown node type");
+    return 0;
+  }
+}
+} // namespace
+
+uint32_t md_parser_t::evaluate_expr(const md_expr_t *expression,
+                                    const uint32_t local_size[3],
+                                    const uint32_t global_size[3],
+                                    const uint32_t *param,
+                                    uint32_t param_count) const {
+  // find the nodes for the given expr_index
+  assert(expression->node_first < hdr->expr_node_count);
+  const md_expr_node_t *node = expr_node + expression->node_first;
+  // the intermediate value stack
+  std::vector<uint32_t> values;
+  // for all of the nodes in this expression
+  for (uint32_t i = 0; i < expression->node_count; ++i) {
+    // get the node
+    const md_expr_node_t &v = node[i];
+    // dispatch the opcode
+    switch (v.type) {
+    case md_type_immediate:
+      values.push_back(v.value);
+      break;
+    case md_type_op_umul: {
+    case md_type_op_udiv:
+    case md_type_op_add:
+    case md_type_op_sub:
+    case md_type_op_min:
+    case md_type_op_max:
+    case md_type_op_umin:
+    case md_type_op_umax:
+    case md_type_op_and:
+    case md_type_op_or:
+    case md_type_op_xor:
+    case md_type_op_shl:
+    case md_type_op_lshr:
+      uint32_t rhs = values.rbegin()[0];
+      uint32_t lhs = values.rbegin()[1];
+      values.pop_back();
+      values.back() = md_eval_expression_type_op_2(v.type, lhs, rhs);
+    }
+      break;
+    case md_type_global_size:
+      assert(v.value < 3);
+      values.push_back(global_size[v.value]);
+      break;
+    case md_type_local_size:
+      assert(v.value < 3);
+      values.push_back(local_size[v.value]);
+      break;
+    case md_type_param:
+      assert(v.value < param_count);
+      values.push_back(param[v.value]);
+      break;
+    default:
+      assert(!"unknown node type");
+    }
+  }
+  // should only be one value remaining which is the result
+  assert(values.size() == 1);
+  return values.back();
+}
--- a/inference-engine/src/vpu/graph_transformer/src/frontend/custom_kernel.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/frontend/custom_kernel.cpp
@ -2,20 +2,30 @@
 // SPDX-License-Identifier: Apache-2.0
 //

-#include <vpu/frontend/custom_kernel.hpp>
-#include <xml_parse_utils.h>
 #include <caseless.hpp>
+#include <vpu/frontend/ShaveElfMetadataParser.h>
+#include <vpu/frontend/custom_kernel.hpp>
+#include <vpu/utils/error.hpp>
 #include <vpu/utils/extra.hpp>
+#include <xml_parse_utils.h>

 namespace vpu {

+VPU_PACKED(Elf32Shdr {
+    uint32_t shName;
+    uint32_t pad0[3];
+    uint32_t shOffset;
+    uint32_t shSize;
+    uint32_t pad1[4];
+};)
+
 VPU_PACKED(Elf32Ehdr {
-    uint8_t  offs1[28];
-    uint32_t ePhoff;        // Program header offset
-    uint32_t eShoff;        // Section header offset
-    uint8_t  offs2[12];
-    uint16_t eShnum;        // Number of sections
-    uint16_t offs3;
+    uint32_t pad0[7];
+    uint32_t ePhoff;
+    uint32_t eShoff;
+    uint32_t pad1[3];
+    uint16_t eShnum;
+    uint16_t eShstrndx;
 };)

 VPU_PACKED(Elf32Section {
@ -95,111 +105,66 @@ std::pair<const Elf32Section*, const Elf32Section*> findSymbolTable(
    return std::make_pair(strShdr, symShdr);
 }

-SmallVector<std::string> deduceKernelParameters(
-        const char* ELFData,
-        uint32_t kernelAddress) {
-    IE_ASSERT(ELFData != nullptr);
-    const auto cmp = ie::details::CaselessEq<std::string>{};
+SmallVector<std::string> deduceKernelParameters(const md_parser_t& parser, int kernelId) {
+    const auto kernelDesc = parser.get_kernel(kernelId);
+    IE_ASSERT(kernelDesc != nullptr);
+    // Number of elements we get from parser is always greater by one
+    const auto argCount = kernelDesc->arg_count - 1;

-    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
-    auto phdr = reinterpret_cast<const Elf32Phdr*>(ELFData + ehdr->ePhoff);
-    auto shdr = reinterpret_cast<const Elf32Section*>(ELFData + ehdr->eShoff);
+    auto arguments = SmallVector<std::string>{};
+    arguments.reserve(argCount);
+    for (size_t i = 0; i < argCount; i++) {
+        const auto arg = parser.get_argument(kernelDesc, i);
+        VPU_THROW_UNLESS(arg, "Error while parsing custom layer elf file.");

-    const Elf32Section* strShdr = nullptr;
-    const Elf32Section* symShdr = nullptr;
-    std::tie(strShdr, symShdr) = findSymbolTable(ELFData);
-    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);
-
-    auto numSymEntries = symShdr->shSize / symShdr->shEntsize;
-    auto sym = reinterpret_cast<const Elf32Sym*>(ELFData + symShdr->shOffset);
-    auto firstStr = ELFData + strShdr->shOffset;
-
-    const char* kernelArgStrings = nullptr;
-    for (size_t i = 0; i < numSymEntries; i++) {
-        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) {
-            kernelArgStrings = ELFData + shdr[sym[i].stShndx].shOffset;
-            break;
+        // skip hoisted buffers
+        if (arg->flags & md_arg_flags_generated_prepost) {
+            continue;
        }
-    }
-    IE_ASSERT(kernelArgStrings != nullptr);

-    SmallVector<std::string> parameters;
-    for (size_t i = 0; i < numSymEntries; i++) {
-        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) {
-            auto ptr = ELFData + shdr[sym[i].stShndx].shOffset;
-            auto numKernels = *reinterpret_cast<const int*>(ptr);
-
-            auto metaOffset = sizeof(int);
-            for (int k = 0; k < numKernels; k++) {
-                auto kHdr = reinterpret_cast<const KernelHdr*>(ptr + metaOffset);
-
-                if (kHdr->address-phdr->pVaddr == kernelAddress) {
-                    auto aHdr = reinterpret_cast<const KernelArgHdr*>(
-                        reinterpret_cast<const char*>(&(kHdr->argOffset)) + sizeof(kHdr->argOffset) + kHdr->argOffset);
-
-                    auto numArgs = reinterpret_cast<const int*>(aHdr)[-1];
-                    for (int n = 0; n < numArgs; n++, aHdr++) {
-                        parameters.push_back(kernelArgStrings + aHdr->stringOffset);
-                    }
-
-                    break;
-                }
-
-                metaOffset += kHdr->sectionSize + sizeof(kHdr->address) + sizeof(kHdr->flags);
-            }
-        }
+        const auto argName = parser.get_name(arg);
+        arguments.emplace_back(argName);
    }

-    return parameters;
+    return arguments;
 }

-int32_t getKernelId(
-        const char* ELFData,
-        uint32_t kernelAddress) {
-    IE_ASSERT(ELFData != nullptr);
-    const auto cmp = ie::details::CaselessEq<std::string>{};
+static const Elf32Shdr *get_elf_section_with_name(const uint8_t *elf_data, const char* section_name) {
+    IE_ASSERT(elf_data);
+    IE_ASSERT(section_name);

-    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
-    auto phdr = reinterpret_cast<const Elf32Phdr*>(ELFData + ehdr->ePhoff);
-    auto shdr = reinterpret_cast<const Elf32Section*>(ELFData + ehdr->eShoff);
+    const auto *ehdr = reinterpret_cast<const Elf32Ehdr *>(elf_data);
+    IE_ASSERT(0 != ehdr->eShoff);
+    IE_ASSERT(0 != ehdr->ePhoff);

-    const Elf32Section* strShdr = nullptr;
-    const Elf32Section* symShdr = nullptr;
-    std::tie(strShdr, symShdr) = findSymbolTable(ELFData);
-    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);
+    // Pointer to the first section header
+    const Elf32Shdr *shdr = reinterpret_cast<const Elf32Shdr *>(elf_data + ehdr->eShoff);

-    auto numSymEntries = symShdr->shSize / symShdr->shEntsize;
-    auto sym = reinterpret_cast<const Elf32Sym*>(ELFData + symShdr->shOffset);
-    auto firstStr = ELFData + strShdr->shOffset;
+    // Pointer to section header string table header
+    const Elf32Shdr *strShdr = &shdr[ehdr->eShstrndx];

-    const char* kernelArgStrings = nullptr;
-    for (size_t i = 0; i < numSymEntries; i++) {
-        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) {
-            kernelArgStrings = ELFData + shdr[sym[i].stShndx].shOffset;
-            break;
-        }
+    // We couldn't find sections for the symbol string names and for the symbols
+    // entries
+    if (!strShdr) {
+        return nullptr;
    }
-    IE_ASSERT(kernelArgStrings != nullptr);

-    for (size_t i = 0; i < numSymEntries; i++) {
-        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) {
-            auto ptr = ELFData + shdr[sym[i].stShndx].shOffset;
-            auto numKernels = *reinterpret_cast<const int*>(ptr);
+    // The string at index 0, which corresponds to the first byte, is a null
+    // character
+    const char *firstStr = reinterpret_cast<const char *>(elf_data + strShdr->shOffset);

-            auto metaOffset = sizeof(int);
-            for (int k = 0; k < numKernels; k++) {
-                auto kHdr = reinterpret_cast<const KernelHdr*>(ptr + metaOffset);
+    // Find the section with the custom SHAVEComputeAorta data
+    for (uint16_t i = 0; i < ehdr->eShnum; i++) {
+        const char *currentSectionName = firstStr + shdr[i].shName;

-                if (kHdr->address-phdr->pVaddr == kernelAddress) {
-                    return k;
-                }
-
-                metaOffset += kHdr->sectionSize + sizeof(kHdr->address) + sizeof(kHdr->flags);
-            }
+        if (0 == strcmp(currentSectionName, section_name)) {
+            return shdr + i;
        }
    }

-    return -1;
+    // If we reached this point, it means that there wasn't a section with
+    // the name we were looking for
+    return nullptr;
 }

 uint32_t getKernelEntry(const char* ELFData, const std::string& kernelName) {
@ -230,8 +195,9 @@ uint32_t getKernelEntry(const char* ELFData, const std::string& kernelName) {
 CustomKernel::CustomKernel(const pugi::xml_node& kernel, std::string configDir): _configDir {std::move(configDir)} {
    _maxShaves = XMLParseUtils::GetIntAttr(kernel, "max-shaves", 0);

+    std::string fileName;
    for (auto source = kernel.child("Source"); !source.empty(); source = source.next_sibling("Source")) {
-        auto fileName = _configDir + "/" + XMLParseUtils::GetStrAttr(source, "filename", "");
+        fileName = _configDir + "/" + XMLParseUtils::GetStrAttr(source, "filename", "");

        std::ifstream inputFile(fileName, std::ios::binary);
        if (!inputFile.is_open()) {
@ -244,9 +210,30 @@ CustomKernel::CustomKernel(const pugi::xml_node& kernel, std::string configDir):
    }

    const auto kernelEntryName = XMLParseUtils::GetStrAttr(kernel, "entry");
-    const auto kernelEntry = getKernelEntry(&_kernelBinary[0], kernelEntryName);
-    _parameters = deduceKernelParameters(&_kernelBinary[0], kernelEntry);
-    _kernelId = getKernelId(&_kernelBinary[0], kernelEntry);
+
+    const auto elf = reinterpret_cast<const uint8_t*>(_kernelBinary.data());
+    const Elf32Shdr *neoMetadataShdr = get_elf_section_with_name(elf, ".neo_metadata");
+    VPU_THROW_UNLESS(neoMetadataShdr, "Error while parsing custom layer elf: Couldn't find .neo_metadata section");
+
+    const uint8_t *neoMetadata = elf + neoMetadataShdr->shOffset;
+    const size_t neoMetadataSize = neoMetadataShdr->shSize;
+
+    const Elf32Shdr *neoMetadataStrShdr = get_elf_section_with_name(elf, ".neo_metadata.str");
+    VPU_THROW_UNLESS(neoMetadataStrShdr, "Error while parsing custom layer elf: Couldn't find .neo_metadata.str section");
+
+    const char *neoMetadataStr = reinterpret_cast<const char *>(elf + neoMetadataStrShdr->shOffset);
+    const size_t neoMetadataStrSize = neoMetadataStrShdr->shSize;
+
+    const auto parser = md_parser_t{neoMetadata, neoMetadataSize, neoMetadataStr, neoMetadataStrSize};
+    _kernelId = parser.get_kernel_id(kernelEntryName);
+    VPU_THROW_UNLESS(_kernelId != -1, "Failed to find kernel with name `%l`", kernelEntryName);
+
+    VPU_THROW_UNLESS(parser.get_kernel_count() == 1,
+                     "Failed to load kernel binary '%l'\n"
+                     "\tReason: binary should contain only one kernel, but contains %l",
+                     fileName, parser.get_kernel_count());
+
+    _parameters = deduceKernelParameters(parser, _kernelId);

    processParametersNode(kernel);
    processWorkSizesNode(kernel);
--- a/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp
+++ b/inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp
@ -136,7 +136,7 @@ private:
            case CustomParamType::OutputBuffer:
            case CustomParamType::Data: {
                VPU_THROW_UNLESS(ports.find(kp) != ports.end(),
-                    "XML specification for %s layer has no definition for %s parameter. Layer name: %s",
+                    "XML specification for %s layer has no definition for '%s' parameter. Layer name: %s",
                    origLayer()->type, kp, origLayer()->name);

                int id = ports.find(kp)->second;
--- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.cpp
+++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.cpp
@ -20,7 +20,7 @@ INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsFakeQuantize_smoke,
 INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsQuantizeBinarize_smoke,
        ::testing::Combine(
        ::testing::ValuesIn(s_QuantizeTensors),
-        ::testing::ValuesIn(s_QuantizeLevels),
+        ::testing::Values(2),
        ::testing::ValuesIn(s_QuantizeSwitchOut),
        ::testing::ValuesIn(s_CustomConfig)));

--- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp
+++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_custom_test.hpp
@ -799,7 +799,7 @@ TEST_P(myriadLayersTestsQuantizeBinarize_smoke, Quantize_Binarization) {
                    </port>
                </output>
            </layer>
-            <layer id="5" name="Quantize" precision="FP16" type="QuantizeTemporaryType">
+            <layer id="5" name="Quantize" precision="FP16" type="FakeQuantizeBin">
                <data levels="@levels@" input_low_size="@input_low_size@" input_high_size="@input_high_size@" output_low_size="@output_low_size@" output_high_size="@output_high_size@" switch_out="@switch_out@"/>
                <input>
                    <port id="0">
@ -1057,6 +1057,10 @@ TEST_P(myriadLayersTestsBinaryConvolution_smoke, BinaryConvolution) {
    }
    _config[InferenceEngine::MYRIAD_CUSTOM_LAYERS] = customConfig;

+    if (kernel.x == 3 && kernel.y == 3 && dilations == 2) {
+        GTEST_SKIP() << "Computing wrong after hoisting";
+    }
+
    SetInputTensor(dims);
    auto dimsOutput = dims;
    dimsOutput.h = (dims.h) / strides;
@ -1112,7 +1116,7 @@ static std::vector<Group> s_BinaryConvolutionGroup = {
 static std::vector<Kernel> s_BinaryConvolutionKernel = {
    {{1, 1}},
    {{1, 3}},
-    {{3, 3}},
+    {{3, 3}}
 };
 static std::vector<Strides> s_BinaryConvolutionStrides = {
    1, 2
--- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_region_test.cpp
+++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_region_test.cpp
@ -14,5 +14,22 @@ INSTANTIATE_TEST_CASE_P(
 		::testing::Values<DoSoftmax>(1, 0),
 		::testing::Values(vpu::LayoutPreference::ChannelMajor, vpu::LayoutPreference::ChannelMinor),
 		::testing::Values(IRVersion::v7, IRVersion::v10),
-		::testing::ValuesIn(s_CustomConfig)
+		::testing::Values("")
 ));
+
+#ifdef VPU_HAS_CUSTOM_KERNELS
+
+INSTANTIATE_TEST_CASE_P(
+	accuracy_custom, myriadLayersTestsRegionYolo_smoke,
+	::testing::Combine(
+		::testing::Values<Coords>(4),
+		::testing::Values<Classes>(20),
+		::testing::Values<Num>(5, 10),
+		::testing::Values<MaskSize>(3),
+		::testing::Values<DoSoftmax>(1, 0),
+		::testing::Values(vpu::LayoutPreference::ChannelMajor, vpu::LayoutPreference::ChannelMinor),
+		::testing::Values(IRVersion::v7, IRVersion::v10),
+		::testing::Values(s_CustomConfig[1])
+));
+
+#endif
--- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.cpp
+++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.cpp
@ -9,5 +9,17 @@ INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsReorg_smoke, ::testing::Combi
    ::testing::Values<Stride>(2),
    ::testing::Values(vpu::LayoutPreference::ChannelMinor, vpu::LayoutPreference::ChannelMajor),
    ::testing::Values(IRVersion::v7, IRVersion::v10),
-    ::testing::ValuesIn(s_CustomConfig)
+    ::testing::Values<CustomConfig>({})
 ));
+
+#ifdef VPU_HAS_CUSTOM_KERNELS
+
+INSTANTIATE_TEST_CASE_P(accuracy_custom, myriadLayersTestsReorg_smoke, ::testing::Combine(
+    ::testing::ValuesIn(s_ReorgInputs_CustomLayer),
+    ::testing::Values<Stride>(2),
+    ::testing::Values(vpu::LayoutPreference::ChannelMinor, vpu::LayoutPreference::ChannelMajor),
+    ::testing::Values(IRVersion::v7, IRVersion::v10),
+    ::testing::Values(s_CustomConfig[1])
+));
+
+#endif
--- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.hpp
+++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_reorg_test.hpp
@ -111,3 +111,9 @@ static std::vector<SizeVector> s_ReorgInputs = {
 		{1, 192, 6 * 26, 6 * 26},
 		{1, 4, 6, 6}
 };
+
+static std::vector<SizeVector> s_ReorgInputs_CustomLayer = {
+		{1, 64, 26, 26},
+		{1, 64, 128, 128},
+		{1, 4, 6, 6}
+};
--- a/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_resample_test.cpp
+++ b/inference-engine/tests_deprecated/functional/vpu/common/layers/myriad_layers_resample_test.cpp
@ -4,13 +4,26 @@

 #include "myriad_layers_resample_test.hpp"

-// #-31522
 INSTANTIATE_TEST_CASE_P(
-	DISABLED_accuracy, myriadResampleLayerTests_smoke,
+	accuracy, myriadResampleLayerTests_smoke,
 	::testing::Combine(
 		::testing::ValuesIn(s_ResampleInput),
 		::testing::Values<Factor>(2.0f, 0.5f),
+		::testing::Values<Antialias>(false),
+		::testing::Values<HwOptimization>(false, true),
+		::testing::Values(""))
+);
+
+#ifdef VPU_HAS_CUSTOM_KERNELS
+
+INSTANTIATE_TEST_CASE_P(
+	accuracy_custom, myriadResampleLayerTests_smoke,
+	::testing::Combine(
+		::testing::ValuesIn(s_ResampleInput),
+		::testing::Values<Factor>(2.0f),
 		::testing::Values<Antialias>(false, true),
 		::testing::Values<HwOptimization>(false, true),
-		::testing::ValuesIn(s_CustomConfig))
+		::testing::Values(s_CustomConfig[1]))
 );
+
+#endif