diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp index fc3e33267e3..dca5348789a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32.cpp @@ -139,6 +139,10 @@ bool ConvolutionKernel_fs_byx_fsv32::Validate(const Params& p, const optional_pa if (cp.output.Feature().pad.before % fsv != 0) return false; + // Input feature padding must be multiple of fsv to keep block alignment + if (cp.inputs[0].Feature().pad.before % fsv != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp index 67c0b3e8124..5533baa796d 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_1x1.cpp @@ -142,6 +142,10 @@ bool ConvolutionKernel_fs_byx_fsv32_1x1::Validate(const Params& p, const optiona if (cp.output.Feature().pad.before % fsv != 0) return false; + // Input feature padding must be multiple of fsv to keep block alignment + if (cp.inputs[0].Feature().pad.before % fsv != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp index dc3f4147249..cbb39991735 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_fs_byx_fsv32_depthwise.cpp @@ -146,6 +146,10 @@ bool ConvolutionKernel_fs_byx_fsv32_depthwise::Validate(const Params& p, const o if (cp.output.Feature().pad.before % fsv != 0) return false; + // Input feature padding must be multiple of fsv to keep block alignment + if (cp.inputs[0].Feature().pad.before % fsv != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp index b963162bc18..25ccfe1c681 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp @@ -68,6 +68,9 @@ bool PoolingKerneGPU_fs_b_yx_fsv32::Validate(const Params& p, const optional_par if (pp.output.Feature().pad.before % 32 != 0) return false; + if (pp.inputs[0].Feature().pad.before % 32 != 0) + return false; + return true; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl index 076464062ad..a3afa685304 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -103,7 +104,7 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)( uint input_offset = oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X; input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING; input_offset += INPUT0_PAD_BEFORE_FEATURE_NUM * INPUT0_FEATURE_PITCH; - input_offset += b * INPUT0_BATCH_PITCH; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * INPUT0_BATCH_PITCH; uint weight_offset = 0; weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV; @@ -243,12 +244,19 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); + uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (pad_before_fs + fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -309,5 +317,6 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)( #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING #undef INPUT_BLOCK_WIDTH_EL_CNT diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl index bed75181362..a6c7bbfa4b7 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,9 +21,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -74,10 +76,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( out[out_i] = UNIT_VAL_ZERO; } + // Calculate offset to first input data element + const uint in_pitch_x = FSV; + const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING; + const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING; + const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING; + uint input_offset = 0; - input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV; - input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV; - input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x; + input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b; + input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs; uint weight_offset = 0; weight_offset += fs * FILTER_SIZE_X * FILTER_SIZE_Y * ALIGNED_IFM_NUM * FSV; @@ -108,7 +117,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ==================================================================== // Move temporary input offset to next row - tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV; + tmp_input_offset += DILATION_SIZE_Y * in_pitch_y; uint tmp_weight_offset = weight_offset; @@ -146,7 +155,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( weight_offset += FILTER_SIZE_X * FSV; } // Move input offset to next input feature slice - input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += in_pitch_fs; // Move weight offset to next input feature slice (FSV input features) // minus offset added by moving FILTER_SIZE_Y times to new row weight_offset += FSV * FILTER_SIZE_Y * FILTER_SIZE_X * FSV // FSV * input filter feature pitch @@ -190,13 +199,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * 
OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (fs + pad_before_fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -243,6 +258,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( #undef INPUT0_SIZE_X_WITH_PADDING #undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl index a82af383d37..7e9f8e7ccbb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_1x1.cl @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,9 +21,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. 
#define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -73,10 +75,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( out[out_i] = UNIT_VAL_ZERO; } + // Calculate offset to first input data element + const uint in_pitch_x = FSV; + const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING; + const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING; + const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING; + uint input_offset = 0; - input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV; - input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV; - input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x; + input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b; + input_offset += (INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs; uint weight_offset = 0; weight_offset += fs * ALIGNED_IFM_NUM * FSV; @@ -119,11 +128,11 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( } } // Move temporary input offset to next strided row - tmp_input_offset += INPUT0_SIZE_X_WITH_PADDING * FSV * STRIDE_SIZE_Y; + tmp_input_offset += in_pitch_y * STRIDE_SIZE_Y; } // ======================================================================== // Move input offset to next input feature slice - input_offset += INPUT0_BATCH_NUM * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; + input_offset += in_pitch_fs; } // ======================================================================== @@ -170,12 +179,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); + uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (pad_before_fs + fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -201,7 +217,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( UNIT_BLOCK_WRITE2(output, output_offset + out_x * FSV, tmp_write); } // Move output offset to next row - output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING; + output_offset += out_pitch_y; } } else @@ -225,7 +241,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( } } // Move output offset to next row - output_offset += FSV * OUTPUT_SIZE_X_WITH_PADDING; + output_offset += out_pitch_y; } } // 
======================================================================== @@ -235,6 +251,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)( #undef INPUT0_SIZE_X_WITH_PADDING #undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl index 7131d3535e9..45c48973cc6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_fs_byx_fsv32_depthwise.cl @@ -21,9 +21,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // In some cases input padding may be bigger than needed, those variables describe the offset into padding. #define INPUT0_PADDING_OFFSET_SIZE_X (INPUT0_PAD_BEFORE_SIZE_X - PADDING_SIZE_X) @@ -72,11 +74,17 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( out[out_i] = UNIT_VAL_ZERO; } + // Calculate offset to first input data element + const uint in_pitch_x = FSV; + const uint in_pitch_y = in_pitch_x * INPUT0_SIZE_X_WITH_PADDING; + const uint in_pitch_b = in_pitch_y * INPUT0_SIZE_Y_WITH_PADDING; + const uint in_pitch_fs = in_pitch_b * INPUT0_SIZE_B_WITH_PADDING; + uint input_offset = 0; - input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * FSV; - input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING * FSV; - input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV; - input_offset += fs * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV * INPUT0_BATCH_NUM; + input_offset += (oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X) * in_pitch_x; + input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * in_pitch_y; + input_offset += (b + INPUT0_PAD_BEFORE_BATCH_NUM) * in_pitch_b; + input_offset += (fs + INPUT0_PAD_BEFORE_FEATURE_NUM / FSV) * in_pitch_fs; uint weight_offset = 0; @@ -105,7 +113,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ==================================================================== // Move temporary input offset to next row - tmp_input_offset += DILATION_SIZE_Y * INPUT0_SIZE_X_WITH_PADDING * FSV; + tmp_input_offset += DILATION_SIZE_Y * in_pitch_y; uint tmp_weight_offset = weight_offset; @@ -174,13 +182,19 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( // ======================================================================== // Store results: + // Calculate offset to first output element + const uint out_pitch_x = FSV; + const uint out_pitch_y = out_pitch_x * OUTPUT_SIZE_X_WITH_PADDING; + const uint out_pitch_b = out_pitch_y * OUTPUT_SIZE_Y_WITH_PADDING; + const uint out_pitch_fs = out_pitch_b * 
OUTPUT_SIZE_B_WITH_PADDING; + const uint pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / FSV); uint output_offset = 0; - output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * FSV; - output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * FSV * OUTPUT_SIZE_X_WITH_PADDING; - output_offset += b * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING; - output_offset += (pad_before_fs + fs) * FSV * OUTPUT_SIZE_X_WITH_PADDING * OUTPUT_SIZE_Y_WITH_PADDING * OUTPUT_BATCH_NUM; + output_offset += (oc + OUTPUT_PAD_BEFORE_SIZE_X) * out_pitch_x; + output_offset += (or + OUTPUT_PAD_BEFORE_SIZE_Y) * out_pitch_y; + output_offset += (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_pitch_b; + output_offset += (pad_before_fs + fs) * out_pitch_fs; const bool full_f = OUTPUT_FEATURE_NUM % FSV == 0 || fs * FSV + FSV <= OUTPUT_FEATURE_NUM; const bool full_x = OUTPUT_SIZE_X % OUTPUT_BLOCK_WIDTH == 0 || oc + OUTPUT_BLOCK_WIDTH <= OUTPUT_SIZE_X; @@ -227,6 +241,8 @@ KERNEL(convolution_gpu_fs_byx_fsv32)( #undef INPUT0_SIZE_X_WITH_PADDING #undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING #undef OUTPUT_SIZE_X_WITH_PADDING #undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl index 7c98ee79530..bec60b14b9e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl @@ -37,8 +37,11 @@ #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) +#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM) + #define OUTPUT_SIZE_X_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) #define OUTPUT_SIZE_Y_WITH_PADDING (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) +#define OUTPUT_SIZE_B_WITH_PADDING (OUTPUT_PAD_BEFORE_BATCH_NUM + OUTPUT_BATCH_NUM + OUTPUT_PAD_AFTER_BATCH_NUM) // Kernel works only for sub_group size of 16 with 32 features slice size and process 2 features per WI #define REQD_SUB_GROUP_SIZE 16 @@ -79,14 +82,18 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( const uint x_pitch = REQD_FEATURE_SLICE_SIZE; // difference in location between (x+1) and (x) const uint y_pitch = x_pitch * INPUT0_SIZE_X_WITH_PADDING; // difference in location between (y+1) and (y) const uint b_pitch = y_pitch * INPUT0_SIZE_Y_WITH_PADDING; // difference in location between (b+1) and (b) - const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM; // difference in location between (fs+1) and (fs) + const uint fs_pitch = b_pitch * INPUT0_SIZE_B_WITH_PADDING; // difference in location between (fs+1) and (fs) const int offset_x = (int)out_x*STRIDE_SIZE_X - PADDING_SIZE_X; const int offset_y = (int)out_y*STRIDE_SIZE_Y - PADDING_SIZE_Y; - const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch + INPUT0_PAD_BEFORE_SIZE_Y * y_pitch; + const size_t padding_offset = INPUT0_PAD_BEFORE_SIZE_X * x_pitch + + INPUT0_PAD_BEFORE_SIZE_Y * y_pitch + + INPUT0_PAD_BEFORE_BATCH_NUM * b_pitch + + INPUT0_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE * fs_pitch; const size_t fs_offset = fs * fs_pitch; // locate beginning of feature tile const size_t 
b_offset = b * b_pitch; // locate beginning of batch + #ifdef CHECK_BOUNDRY if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y) @@ -152,15 +159,14 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( const size_t out_x_pitch = REQD_FEATURE_SLICE_SIZE; const size_t out_y_pitch = out_x_pitch * OUTPUT_SIZE_X_WITH_PADDING; const size_t out_b_pitch = out_y_pitch * OUTPUT_SIZE_Y_WITH_PADDING; - const size_t out_fs_pitch = out_b_pitch * OUTPUT_BATCH_NUM; + const size_t out_fs_pitch = out_b_pitch * OUTPUT_SIZE_B_WITH_PADDING; const size_t out_pad_before_fs = (OUTPUT_PAD_BEFORE_FEATURE_NUM / REQD_FEATURE_SLICE_SIZE); const size_t out_x_offset = (out_x + OUTPUT_PAD_BEFORE_SIZE_X) * out_x_pitch; const size_t out_y_offset = (out_y + OUTPUT_PAD_BEFORE_SIZE_Y) * out_y_pitch; - const size_t out_b_offset = b * out_b_pitch; + const size_t out_b_offset = (b + OUTPUT_PAD_BEFORE_BATCH_NUM) * out_b_pitch; const size_t out_fs_offset = (fs + out_pad_before_fs) * out_fs_pitch; - const size_t output_offset = out_fs_offset + out_b_offset + out_y_offset + out_x_offset; const bool full_f = OUTPUT_FEATURE_NUM % REQD_FEATURE_SLICE_SIZE == 0 || @@ -204,3 +210,15 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)( #undef OUTPUT_VEC2 #undef TO_OUTPUT_VEC2 + +#undef INPUT0_SIZE_X_WITH_PADDING +#undef INPUT0_SIZE_Y_WITH_PADDING +#undef INPUT0_SIZE_B_WITH_PADDING + +#undef OUTPUT_SIZE_X_WITH_PADDING +#undef OUTPUT_SIZE_Y_WITH_PADDING +#undef OUTPUT_SIZE_B_WITH_PADDING + +#undef REQD_SUB_GROUP_SIZE +#undef REQD_FEATURE_SLICE_SIZE +#undef REQD_FEATURES_PER_WORK_ITEM diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp index 362e9a76579..bda68337a64 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp @@ -448,8 +448,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const { definitions.push_back({ safe_index_func_name, safe_index_func_val }); definitions.push_back({ index_func_name, index_func_val }); } else { - definitions.push_back({ safe_index_func_name, "(f)" }); - definitions.push_back({ index_func_name, "(f)" }); + definitions.push_back({ safe_index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" }); + definitions.push_back({ index_func_name, "(" + std::to_string(_tensor.Feature().pad.before) + " + (f))" }); } } else { definitions.push_back({ safe_index_func_name, safe_index_func_val }); diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp index d2ce48e8c60..81db4d52529 100644 --- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp +++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp @@ -38,6 +38,222 @@ using namespace cldnn; +namespace { + +struct concat_noop_optimization : pattern_match_optimization_typed { + // Removes concatenation nodes with single input. + using base = pattern_match_optimization_typed; + using base::base; + + bool match(concatenation_node& node); + bool optimize(concatenation_node& node); +}; + +struct concat_in_place_optimization : pattern_match_optimization_typed { + // Performs in-place concat optimization. + // Padding of predecessors is updated to use single buffer by all, which is output from concatenation. 
+    // Then concatenation can be optimized out, as memory will be correctly filled by previous nodes.
+    // If one of the dependencies is also an optimized-out concatenation, then cascade adjustment is performed to update it.
+    // This optimization is expected to be executed in some topological order, as cascade adjustment is performed backwards.
+    using base = pattern_match_optimization_typed<concat_in_place_optimization, concatenation>;
+    using base::base;
+
+    // Runs concat in-place optimization and adds already optimized concatenations that need re-optimization to `need_reoptimization`.
+    void optimize_cascade(concatenation_node& node, std::list<concatenation_node*>& need_reoptimization);
+    bool match(concatenation_node& node);
+    bool optimize(concatenation_node& node) {
+        std::list<concatenation_node*> need_reopt;
+        optimize_cascade(node, need_reopt);
+        while (!need_reopt.empty()) {
+            auto& prop = *need_reopt.front();
+            need_reopt.pop_front();
+            if (match(prop))
+                optimize_cascade(prop, need_reopt);
+            else
+                // TODO: Revert extra padding when cascade adjustment failed.
+                prop.can_be_optimized(false);
+        }
+        return false;  // node not invalidated
+    }
+};
+
+bool concat_noop_optimization::match(concatenation_node& node) {
+    if (node.is_output() && !get_program().is_debug_build())
+        return false;
+    return node.get_dependencies().size() == 1 &&
+           !node.has_fused_primitives() &&
+           node.get_fused_activations_funcs().empty();
+}
+
+bool concat_noop_optimization::optimize(concatenation_node& node) {
+    auto& dep = node.get_dependency(0);
+    dep.merge_output_padding(node.get_output_layout().data_padding);
+    prog.extract_and_remove(node);
+    // Node has been removed, so no further optimizations.
+    return true;
+}
+
+bool concat_in_place_optimization::match(concatenation_node& node) {
+    if (node.is_output() && !get_program().is_debug_build())
+        return false;
+    if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
+        return false;
+
+    // For in-place concatenation input layouts and data types must match.
+    auto output_format = node.get_output_layout().format;
+    auto output_datatype = node.get_output_layout().data_type;
+    auto concat_axis = node.get_primitive()->axis;
+
+    for (auto& input : node.get_dependencies()) {
+        if (input->is_type<reshape>())
+            // reshapes should be optimized out.
+            return false;
+
+        layout l = input->get_output_layout();
+
+        if (output_format != l.format || output_datatype != l.data_type)
+            return false;
+
+        // TODO: Below condition should be moved to program_node::supports_padding.
+        // This however will require updating the algorithm, as it may make cascade adjustment impossible in some cases.
+        // It would, however, make normal optimizations possible in other cases, so this is a trade-off to be investigated.
+ if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) && + (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required + if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) + return false; + + if (l.format == format::bs_fs_yx_bsv16_fsv16) + return false; + + if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f)) + return false; + } + + auto lower_padd_in_axis = node.get_output_layout().data_padding.lower_size().raw[concat_axis]; + lower_padd_in_axis = std::max(lower_padd_in_axis, + node.get_dependency(0).get_output_layout().data_padding.lower_size().raw[concat_axis]); + + // check if concatenation in place can be applied for inputs set + size_t idx = 0; + for (auto input : node.get_dependencies()) { + // reverted condition - if any of this node's inputs is used by more than one primitive + // and is not optimized concatenation then do not fuse buffers + // todo: we need add padding support for all optimized kernels to remove this condition + if (!input->is_type() && !input->is_type() && + !input->is_type() && !input->is_type() && + !input->is_type() && !input->is_type() && !input->is_type() && + !input->is_type()) + return false; + + // if an input is marked as network output, prevent optimizations + // which would affect a form of its output (unless debug flag is set), + // we also need to restrict input types to those which support padding on all axis + if ((input->is_output() && !get_program().is_debug_build()) || + !input->is_padding_supported(concat_axis, lower_padd_in_axis)) + return false; + + // TODO: Investigate if this condition is needed + if (input->get_users().size() > 2) + return false; + + // Check that input isn't optimized out concatenation along different axis. + if (input->is_type() && input->can_be_optimized() && + input->as().get_primitive()->axis != concat_axis) + return false; + + // Check that input isn't optimized out non-concatenation. + if (!input->is_type() && input->can_be_optimized()) + return false; + + size_t concat_users = 0; + for (auto& user : input->get_users()) + if (user->is_type()) + concat_users += 1; + + // If input is used by more than one concatenation then they may require different paddings. + if (concat_users != 1) + return false; + + auto input_padd = input->get_output_layout().data_padding; + + // Check that there isn't already some padding between inputs in concat axis. + // If node has already been optimized we skip this check - this is just cascade adjustment. 
+ if (!node.can_be_optimized()) { + if (idx != node.get_dependencies().size() && input_padd.upper_size().raw[concat_axis] != 0) + return false; + if (idx != 0 && input_padd.lower_size().raw[concat_axis] != 0) + return false; + } + + lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis]; + idx += 1; + } + + return true; +} + +void concat_in_place_optimization::optimize_cascade(concatenation_node& node, std::list& need_reoptimization) { + auto concat_axis = node.get_primitive()->axis; + + // Select output padding by propagating all required input paddings. + auto padd = node.get_output_layout().data_padding; + for (auto input : node.get_dependencies()) { + padd = padding::max(padd, input->get_output_layout().data_padding); + } + + auto lower_padd = padd.lower_size(); + auto upper_padd = padd.upper_size(); + + // For cascade adjustment override padding in concat axis to output padding. + // In other case match(...) already checked that only first/last input have lower/upper padding. + if (node.can_be_optimized()) { + lower_padd.raw[concat_axis] = node.get_output_layout().data_padding.lower_size().raw[concat_axis]; + upper_padd.raw[concat_axis] = node.get_output_layout().data_padding.upper_size().raw[concat_axis]; + } + node.set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes())); + + upper_padd.raw[concat_axis] += node.get_output_layout().size.raw[concat_axis]; + + // apply concatenation in place optimization + for (auto input : node.get_dependencies()) { + auto input_lenght = input->get_output_layout().size.raw[concat_axis]; + + if (input->is_type() && input->can_be_optimized()) + need_reoptimization.push_back(&input->as()); + + // shrink upper pad so it points at the end of the input's buffer + // + // |--- lower padd ---| |---------- upper padd -----------| + // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| + upper_padd.raw[concat_axis] -= input_lenght; + + // set new padding for input + input->set_output_padding(padding(lower_padd.sizes(), upper_padd.sizes())); + + // move lower padd further + // + // |-------------- lower padd -------------|---------- upper padd -----------| + // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| + lower_padd.raw[concat_axis] += input_lenght; + } + + node.can_be_optimized(true); + for (auto dep : node.get_users()) { + dep->can_share_buffer(false); + } +} + +} // namespace + // ToDo remove friendship relation from program_node void prepare_buffer_fusing::run(program_impl& p) { bool is_debug = p.get_options().get()->enabled(); @@ -57,198 +273,11 @@ void prepare_buffer_fusing::run(program_impl& p) { }; // [1] First try to optimize all concats - auto node_itr = p.get_processing_order().begin(); - while (node_itr != p.get_processing_order().end()) { - auto& node = (*node_itr++); - if (!can_optimize(node)) - continue; - program_helpers::do_for_types(*node, [&p, is_debug](concatenation_node& node) { - // For in place concatenation input layouts and data types must match - auto output_format = node.get_output_layout().format; - auto output_datatype = node.get_output_layout().data_type; - // we need to avoid mixing padded and unpadded buffer - bool all_dependencies_padded = true; - bool all_dependencies_unpadded = true; - for (auto& input : node.get_dependencies()) { - if (input->type() == reshape::type_id()) - // reshapes should be optimized out - return; - - layout l = input->get_output_layout(); - if (static_cast(l.data_padding)) - all_dependencies_unpadded = false; - else - 
all_dependencies_padded = false; - - if (output_format != l.format || output_datatype != l.data_type) - return; - - if (l.format == format::b_fs_yx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - if (l.format == format::b_fs_zyx_fsv16 && (l.size.feature[0] % 16 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - if ((l.format == format::b_fs_yx_fsv32 || l.format == format::b_fs_zyx_fsv32) && - (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - // TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required - if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f)) - return; - - if (l.format == format::bs_fs_yx_bsv16_fsv16) - return; - - if (l.format == format::b_fs_yx_fsv4 && (l.size.feature[0] != 8 || node.get_primitive()->axis != concatenation::along_f)) - return; - } - - auto concat_axis = node.get_primitive()->axis; - auto padd = node.get_output_layout().data_padding; - - tensor lower_padd = padd.lower_size(); - tensor upper_padd = padd.upper_size(); - - auto upper_padd_val = - node.get_output_layout().get_buffer_size().raw[concat_axis] - lower_padd.raw[concat_axis]; - tensor lower_padd_offset = lower_padd; - - std::list, tensor>> stack = { - std::make_pair(node.get_dependencies(), tensor(0))}; - while (!stack.empty()) { - auto nodes_list = stack.front(); - stack.pop_front(); - - // if concatenation has only one input it does nothing, remove the node - if (node.get_dependencies().size() == 1) { - p.extract_and_remove(node); - return; - } - - auto cascade_adjustment = nodes_list.second; - upper_padd.raw[concat_axis] = upper_padd_val; - lower_padd = lower_padd_offset; - - auto lower_padd_in_axis = lower_padd.raw[concat_axis] + cascade_adjustment.raw[concat_axis]; - auto first_input_format = nodes_list.first[0]->get_output_layout().format; - - // check if concatenation in place can be applied for inputs set - for (auto input : nodes_list.first) { - // reverted condition - if any of this node's inputs is used by more than one primitive - // and is not optimized concatenation then do not fuse buffers - // todo: we need add padding support for all optimized kernels to remove this condition - if (!input->is_type() && !input->is_type() && - !input->is_type() && !input->is_type() && - !input->is_type() && !input->is_type() && !input->is_type() && - !input->is_type()) - return; - - // if an input is marked as network output, prevent optimizations - // which would affect a form of its output (unless debug flag is set), - // we also need to restrict input types to those which support padding on all axis - if ((input->is_output() && !is_debug) || input->get_users().size() > 2 || - !input->is_padding_supported(concat_axis, lower_padd_in_axis)) - return; - - if (input->get_users().size() > 1) { - auto user_count = input->get_users().size(); - for (auto& user : input->get_users()) - if (user->is_type()) - user_count--; - if (user_count != 1) // user_cout == 0 means that input will be used only by concatenations, so - // we cannot apply concat in place for it - return; - } - - // check if all inputs have the same format - if (input->get_output_layout().format != first_input_format) - return; - - lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis]; - } - - // check if it is worth doing concat in place, in case the following primitive 
is convolution - // with different input padding than concatenation's input users' convolutions, - // it is likely that convolution's implementation will be a reference one, due to mismatched padding - // and performance gain by doing in place concat is nullified by slower convolution implementation - // this should be handled by more advanced tuning mechanism on the topology level - auto& users = node.get_users(); - if (users.size() == 1) { - auto& user = users.front(); - if (node.get_output_layout().format == format::bfyx && user->type() == convolution::type_id()) { - auto out_input_offsets = user->as().get_primitive()->input_offset; - - std::vector in_input_offsets; - for (auto& in_user : nodes_list.first) { - if (in_user->type() == convolution::type_id()) - in_input_offsets.push_back(in_user->as().get_primitive()->input_offset); - } - - for (auto& in_input_offset : in_input_offsets) { - if (in_input_offset.spatial[0] != out_input_offsets.spatial[0] && - in_input_offset.spatial[1] != out_input_offsets.spatial[1]) - return; - } - } else if (user->type() == fused_conv_eltwise::type_id()) { - if (!user->as().get_fused_primitives().empty() && - user->as().get_fused_primitives().begin()->node->is_type()) - return; - } - } - - // apply concatenation in place optimization - for (auto input : nodes_list.first) { - auto input_lenght = input->get_output_layout().size.raw[concat_axis]; - - bool optimized_concat_input = false; - if (input->type() == concatenation::type_id() && input->can_be_optimized()) { - if (input->as().get_primitive()->axis != node.get_primitive()->axis) - return; - optimized_concat_input = true; - } else if (input->can_be_optimized()) { - return; - } - - // shrink upper pad so it points at the end of the input's buffer - // - // |--- lower padd ---| |---------- upper padd -----------| - // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| - upper_padd.raw[concat_axis] -= input_lenght; - - // adjust padding sizes for cascade concatenations - auto lower_padd_tmp = lower_padd; - lower_padd_tmp.raw[concat_axis] += cascade_adjustment.raw[concat_axis]; - auto upper_padd_tmp = upper_padd; - upper_padd_tmp.raw[concat_axis] -= cascade_adjustment.raw[concat_axis]; - - // set new padding for input - input->set_output_padding(padding(lower_padd_tmp.sizes(), upper_padd_tmp.sizes())); - - // move lower padd further - // - // |-------------- lower padd -------------|---------- upper padd -----------| - // |-- output padd ---| ----- input1 ------|----- input2 -----|-- out padd --| - - lower_padd.raw[concat_axis] += input_lenght; - - if (optimized_concat_input && !input->get_dependencies().empty()) - stack.push_back(std::make_pair(input->get_dependencies(), - input->get_output_layout().data_padding.lower_size())); - } - } - - node.can_be_optimized(true); - for (auto dep : node.get_users()) { - dep->can_share_buffer(false); - } - if (!all_dependencies_padded && !all_dependencies_unpadded) - node.can_share_buffer(false); - }); - } + run_node_optimizations(p); // [2] Then try to optimize all crops - node_itr = p.get_processing_order().begin(); + auto node_itr = p.get_processing_order().begin(); while (node_itr != p.get_processing_order().end()) { auto& node = (*node_itr++); if (!can_optimize(node)) diff --git a/inference-engine/thirdparty/clDNN/src/include/program_helpers.h b/inference-engine/thirdparty/clDNN/src/include/program_helpers.h index 7ec262217e1..57e56f7587b 100644 --- a/inference-engine/thirdparty/clDNN/src/include/program_helpers.h +++ 
b/inference-engine/thirdparty/clDNN/src/include/program_helpers.h
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -135,4 +135,105 @@ struct program_helpers {
     }
     static layout get_weights_layout(typed_program_node<data>& data_node, int32_t split);
 };
+
+// Base class for performing pattern-match style optimizations.
+// Uses the CRTP idiom; the implementing class should be passed as template parameter `Impl`
+// and overload the match and optimize methods.
+template <typename Impl>
+struct pattern_match_optimization {
+    pattern_match_optimization(program_impl& prog)
+        : prog(prog)
+    {}
+
+    // Returns whether optimization can be performed for specified node.
+    bool match(program_node& node) {
+        return static_cast<Impl*>(this)->match(node);
+    }
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool optimize(program_node& node) {
+        // TODO: Add program optimizer class that would take responsibility of modifying program.
+        // Then use it to provide more complex control over pattern-matches, i.e.:
+        // new node added - run applicable optimizations on it as well;
+        // node deleted - don't do more optimizations;
+        return static_cast<Impl*>(this)->optimize(node);
+    }
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool match_and_optimize(program_node& node) {
+        if (!match(node))
+            return false;
+        return optimize(node);
+    }
+
+    program_impl& get_program() { return prog; }
+
+    program_impl& prog;
+};
+
+// Class for pattern-match optimizations that provides support for matching
+// a single primitive type `Prim`.
+// Implementing class `Impl` is expected to overload:
+//   bool match(typed_program_node<Prim>&)
+//   bool optimize(typed_program_node<Prim>&)
+// Uses the CRTP idiom; the implementing class should be passed as template parameter `Impl`.
+template <typename Impl, typename Prim>
+struct pattern_match_optimization_typed : pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>> {
+    using base = pattern_match_optimization<pattern_match_optimization_typed<Impl, Prim>>;
+
+    using base::base;
+
+    // Returns whether optimization can be performed for specified node.
+    bool match(program_node& node) {
+        if (!node.is_type<Prim>())
+            return false;
+        return static_cast<Impl*>(this)->match(node.as<Prim>());
+    }
+    // Should be overloaded by implementation class to match specified primitive.
+    bool match(typed_program_node<Prim>& node) {
+        return false;
+    }
+
+    // Returns whether optimization invalidated the node and no further optimizations should execute.
+    bool optimize(program_node& node) {
+        return static_cast<Impl*>(this)->optimize(node.as<Prim>());
+    }
+    // Should be overloaded by implementation class to optimize specified primitive.
+    bool optimize(typed_program_node<Prim>& node) {
+        return false;
+    }
+};
+
+// Runs pattern-match optimizations passed as arguments on `node`.
+inline bool run_node_optimizations(program_node& /*node*/) {
+    return false;
+}
+
+template <typename Opt, typename... Rest>
+bool run_node_optimizations(program_node& node, Opt&& opt, Rest&&... rest) {
+    if (opt.match_and_optimize(node))
+        return true;
+    return run_node_optimizations(node, std::forward<Rest>(rest)...);
+}
+
+// Runs pattern-match optimizations `Opts` on `node`.
+// Optimizations should have a constructor with single argument `program_impl&`.
+template <typename... Opts>
+bool run_node_optimizations(program_impl& p, program_node& node) {
+    return run_node_optimizations(node, Opts(p)...);
+}
+
+// Runs specified pattern-match optimizations on whole program, in processing order.
+template +void run_node_optimizations(program_impl& p, Opts&&... opts) { + auto it = p.get_processing_order().begin(); + while (it != p.get_processing_order().end()) { + auto node = *it++; + run_node_optimizations(*node, std::forward(opts)...); + } +} + +template +void run_node_optimizations(program_impl& p) { + run_node_optimizations(p, Opts(p)...); +} + } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp index 0d8bc880002..05e32e547d3 100644 --- a/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp +++ b/inference-engine/thirdparty/clDNN/tests/test_cases/depth_concatenate_gpu_test.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -431,6 +431,211 @@ TEST(depth_concatenate_f32_gpu, test05_different_formats) { } } +TEST(depth_concatenate_f32_gpu, test06_padded_input) { + // input1 - activation - concatenation - concatenation - reorder + // / / + // input2 - activation - convolution* / + // + // *Convolution has input offset so it should be propagated, both back to reorders and to second concatenation. + // As a result both concatenations should be optimized out and convolution should use optimized implementation. + const int32_t input_f = 32; + const int32_t output_f = 3 * input_f; + + const auto& engine = get_test_engine(); + auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} }); + auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} }); + + auto input1_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + auto input2_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + set_values(input1, flatten_4d(format::bfyx, input1_data)); + set_values(input2, flatten_4d(format::bfyx, input2_data)); + + auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {input_f, input_f, 3, 3} }); + // Construct weights for convolution that just double input values. 
+    VVVVF<FLOAT16> weights_data;
+    weights_data.resize(input_f);
+    for (size_t oi = 0; oi < input_f; ++oi) {
+        weights_data[oi].resize(input_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+        weights_data[oi][oi][1][1] = 2.f;
+    }
+    set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+    topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+    topology.add(data("weights", weights));
+    topology.add(convolution("conv", "actv2", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+    topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+    topology.add(concatenation("depth2", { "depth1", "conv" }, concatenation::along_f));
+    topology.add(reorder("output", "depth2", format::bfyx, data_types::f32));
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+    network network(engine, topology, options);
+
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+
+    auto outputs = network.execute({});
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "output");
+    // Check that all concatenations have been optimized out.
+    auto executed_primitives = network.get_executed_primitives();
+    EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+    EXPECT_TRUE(executed_primitives.count("depth2") == 0);
+    // Check that convolution was able to use an optimized kernel.
+    for (auto& info : network.get_primitives_info()) {
+        if (info.original_id == "conv") {
+            EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+        }
+    }
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output.count(), output_f);
+    for (size_t i = 0; i < output_f; ++i) {
+        auto& val = output_ptr[i];
+        float ref;
+        if (i < input_f)
+            ref = 0.75f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+        else if (i < 2 * input_f)
+            ref = 0.5f * static_cast<float>(input2_data[0][i % input_f][0][0]);
+        else
+            ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+        EXPECT_EQ(val, ref) << " at i=" << i;
+    }
+}
+
+TEST(depth_concatenate_f32_gpu, test07_padded_output) {
+    // input1 - activation - concatenation - convolution - reorder
+    // input2 - activation /
+    //
+    // *Convolution has input offset so it should be propagated back to activations.
+    const int32_t input_f = 32;
+    const int32_t output_f = 2 * input_f;
+
+    const auto& engine = get_test_engine();
+    auto input1 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+    auto input2 = memory::allocate(engine, { data_types::f16, format::fs_b_yx_fsv32, {1, input_f, 1, 1} });
+
+    auto input1_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    auto input2_data = generate_random_4d<FLOAT16>(1, input_f, 1, 1, -1, 1);
+    set_values(input1, flatten_4d(format::bfyx, input1_data));
+    set_values(input2, flatten_4d(format::bfyx, input2_data));
+
+    auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, {output_f, output_f, 3, 3} });
+    // Construct weights for convolution that just double input values.
+    VVVVF<FLOAT16> weights_data;
+    weights_data.resize(output_f);
+    for (size_t oi = 0; oi < output_f; ++oi) {
+        weights_data[oi].resize(output_f, VVF<FLOAT16>(3, VF<FLOAT16>(3, FLOAT16(0.f))));
+        weights_data[oi][oi][1][1] = 2.f;
+    }
+    set_values(weights, flatten_4d(format::bfyx, weights_data));
+
+    topology topology;
+    topology.add(input_layout("input1", input1.get_layout()));
+    topology.add(input_layout("input2", input2.get_layout()));
+    topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f }));
+    topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f }));
+    topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f));
+    topology.add(data("weights", weights));
+    topology.add(convolution("conv", "depth1", { "weights" }, tensor(1), tensor(batch(0), feature(0), spatial(-1, -1, 0, 0))));
+    topology.add(reorder("output", "conv", format::bfyx, data_types::f32));
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    options.set_option(cldnn::build_option::force_implementations({ {"conv", implementation_desc{format::fs_b_yx_fsv32, ""} } }));
+    network network(engine, topology, options);
+
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+
+    auto outputs = network.execute({});
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "output");
+    // Check that all concatenations have been optimized out.
+    auto executed_primitives = network.get_executed_primitives();
+    EXPECT_TRUE(executed_primitives.count("depth1") == 0);
+    // Check that convolution was able to use an optimized kernel.
+    for (auto& info : network.get_primitives_info()) {
+        if (info.original_id == "conv") {
+            EXPECT_TRUE(info.kernel_id.find("ref") == std::string::npos) << " selected kernel: " << info.kernel_id;
+        }
+    }
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+    ASSERT_EQ(output.count(), output_f);
+    for (size_t i = 0; i < output_f; ++i) {
+        auto& val = output_ptr[i];
+        float ref;
+        if (i < input_f)
+            ref = 1.5f * static_cast<float>(input1_data[0][i % input_f][0][0]);
+        else
+            ref = static_cast<float>(input2_data[0][i % input_f][0][0]);
+
+        EXPECT_EQ(val, ref) << " at i=" << i;
+    }
+}
+
+TEST(depth_concatenate_f32_gpu, test07_concat_is_output) {
+    // input1 - activation - concatenation
+    // input2 - activation /
+    //
+    // As the concatenation is an output it should not be optimized out.
+ const int32_t input_f = 16; + const int32_t output_f = 2 * input_f; + + const auto& engine = get_test_engine(); + auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} }); + auto input2 = memory::allocate(engine, { data_types::f32, format::bfyx, {1, input_f, 1, 1} }); + + auto input1_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + auto input2_data = generate_random_4d(1, input_f, 1, 1, -1, 1); + set_values(input1, flatten_4d(format::bfyx, input1_data)); + set_values(input2, flatten_4d(format::bfyx, input2_data)); + + topology topology; + topology.add(input_layout("input1", input1.get_layout())); + topology.add(input_layout("input2", input2.get_layout())); + topology.add(activation("actv1", "input1", activation_func::linear, { 0.75f })); + topology.add(activation("actv2", "input2", activation_func::linear, { 0.5f })); + topology.add(concatenation("depth1", { "actv1", "actv2" }, concatenation::along_f)); + + cldnn::build_options options; + options.set_option(cldnn::build_option::optimize_data(true)); + network network(engine, topology, options); + + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + + auto outputs = network.execute({}); + EXPECT_EQ(outputs.size(), size_t(1)); + EXPECT_EQ(outputs.begin()->first, "depth1"); + // Check that concatenation haven't been optimized out. + auto executed_primitives = network.get_executed_primitives(); + EXPECT_TRUE(executed_primitives.count("depth1") == 1); + + auto output = outputs.at("depth1").get_memory(); + auto output_ptr = output.pointer(); + ASSERT_EQ(output.count(), output_f); + for (size_t i = 0; i < output_f; ++i) { + auto& val = output_ptr[i]; + float ref; + if (i < input_f) + ref = 0.75f * input1_data[0][i % input_f][0][0]; + else + ref = 0.5f * input2_data[0][i % input_f][0][0]; + + EXPECT_EQ(val, ref) << " at i=" << i; + } +} + TEST(depth_concatenate_f32_gpu, concat_with_different_format_inputs) { const auto& engine = get_test_engine(); build_options build_opt;
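To tie the two halves of this patch together, below is a small self-contained C++ sketch. It is not taken from the patch; names such as fs_b_yx_fsv32_offset and assign_in_place_padding are invented for illustration only. The first function mirrors the addressing scheme the reworked kernels use, where each pitch is built from the padded extent of the previous dimension and the batch/feature "before" paddings are added to the indices (previously the fs pitch used the unpadded batch count and the batch padding was ignored). The second reproduces the lower/upper padding walk from concat_in_place_optimization::optimize_cascade, which is what lets every concatenation input write straight into the shared output buffer.

#include <cassert>
#include <cstdio>
#include <vector>

// Feature slice size of the fs_b_yx_fsv32 layout.
constexpr int FSV = 32;

// Offset of logical element (b, f, y, x) in an fs_b_yx_fsv32 buffer whose padded
// extents are size_*_with_padding and whose "before" paddings are pad_before_*.
// pad_before_f is assumed to be a multiple of FSV, which is exactly what the new
// Validate() checks in the kernel selectors enforce.
int fs_b_yx_fsv32_offset(int b, int f, int y, int x,
                         int size_x_with_padding, int size_y_with_padding, int size_b_with_padding,
                         int pad_before_x, int pad_before_y, int pad_before_b, int pad_before_f) {
    const int pitch_x  = FSV;
    const int pitch_y  = pitch_x * size_x_with_padding;
    const int pitch_b  = pitch_y * size_y_with_padding;
    // The patch changes the fs pitch to use the padded batch extent instead of the
    // raw batch count, and adds pad_before_b to the batch index below.
    const int pitch_fs = pitch_b * size_b_with_padding;

    const int fs = (f + pad_before_f) / FSV;  // feature slice index
    const int fi = (f + pad_before_f) % FSV;  // index inside the 32-wide slice
    return fs * pitch_fs +
           (b + pad_before_b) * pitch_b +
           (y + pad_before_y) * pitch_y +
           (x + pad_before_x) * pitch_x +
           fi;
}

// Plain aggregate standing in for cldnn::padding along a single axis.
struct axis_padding {
    int lower;  // elements before this input's data in the shared buffer
    int upper;  // elements after this input's data in the shared buffer
};

// Padding each input needs along the concat axis so that all of them write directly
// into the concatenation's buffer; mirrors the lower/upper walk in optimize_cascade.
std::vector<axis_padding> assign_in_place_padding(int out_lower, int out_upper,
                                                  const std::vector<int>& input_sizes) {
    int total = 0;
    for (int s : input_sizes)
        total += s;

    int lower = out_lower;          // grows as we walk over the inputs
    int upper = out_upper + total;  // shrinks as we walk over the inputs

    std::vector<axis_padding> result;
    for (int len : input_sizes) {
        upper -= len;                     // shrink upper pad to the end of this input
        result.push_back({lower, upper});
        lower += len;                     // move lower pad past this input
    }
    return result;
}

int main() {
    // Two 32-feature inputs concatenated along f with no extra output padding:
    // input1 occupies features [0, 32) and needs 32 features of upper padding,
    // input2 occupies features [32, 64) and needs 32 features of lower padding.
    auto padds = assign_in_place_padding(0, 0, {32, 32});
    assert(padds[0].lower == 0 && padds[0].upper == 32);
    assert(padds[1].lower == 32 && padds[1].upper == 0);

    // With that padding, element (b=0, f=0, y=0, x=0) of input2 lands one full
    // feature slice into the shared 1x64x1x1 fs_b_yx_fsv32 buffer (offset 32).
    int off = fs_b_yx_fsv32_offset(0, 0, 0, 0,
                                   /*size_x*/ 1, /*size_y*/ 1, /*size_b*/ 1,
                                   /*pad_x*/ 0, /*pad_y*/ 0, /*pad_b*/ 0,
                                   /*pad_f*/ padds[1].lower);
    assert(off == 32);
    std::printf("input2 element (0,0,0,0) -> offset %d\n", off);
    return 0;
}

The multiple-of-FSV requirement in the sketch is the reason the Validate() additions at the top of this patch reject input feature padding that is not a multiple of 32: only then does the lower feature padding translate into a whole number of feature slices.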