diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.cpp deleted file mode 100644 index 45c57a8fd19..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.cpp +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2017-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "convolution_kernel_bfyx_3x3_dw_opt.h" -#include - -namespace kernel_selector { -ConvolutionKernel_bfyx_3x3_dw_opt::ConvolutionKernel_bfyx_3x3_dw_opt() - : ConvolutionKernelBase("convolution_gpu_bfyx_3x3_dw_opt") { - // Generate the dispatch options to the auto-tuner. - std::vector tileXDimSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14}; - std::vector tileYDimSizes = {1, 2, 3, 4, 5, 6, 7}; - std::vector executionModes = ConvolutionKernelBase::autoTuneOptions; - - for (auto tileXDim : tileXDimSizes) { - for (auto tileYDim : tileYDimSizes) { - for (auto executionMode : executionModes) { - autoTuneOptions.emplace_back(AutoTuneOption{{tileXDim, tileYDim}, executionMode}); - } - } - } -} - -ParamsKey ConvolutionKernel_bfyx_3x3_dw_opt::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::F16); - k.EnableInputWeightsType(WeightsType::F16); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableSubGroup(); - k.EnableSubGroupShort(); - k.EnableDepthwiseSeparableOpt(); - return k; -} - -bool ConvolutionKernel_bfyx_3x3_dw_opt::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - if ((cp.filterSize.x != 3) || (cp.filterSize.y != 3) || (cp.stride.x != 1) || (cp.stride.y != 1) || - (cp.padding.x != 1) || (cp.padding.y != 1) || (cp.inputs[0].Feature().v != cp.split) || - cp.output.PitchesDifferFromLogicalDims()) { - return false; - } - - return true; -} - -ConvolutionKernel_bfyx_3x3_dw_opt::AutoTuneOption ConvolutionKernel_bfyx_3x3_dw_opt::GetAutoTuneOptions(const Params&, - int autoTuneIndex) const { - if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast(autoTuneOptions.size()))) { - return autoTuneOptions[autoTuneIndex]; - } - - constexpr int simdSize = 16; - - return AutoTuneOption{{simdSize - 2, 7}, DEFAULT}; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_3x3_dw_opt::SetDefault(const convolution_params& params, - int autoTuneIndex) const { - constexpr int simdSize = 16; - - DispatchData dispatchData = Parent::SetDefault(params); - - auto options = GetAutoTuneOptions(params, autoTuneIndex); - - const int numTilesX = static_cast( - std::ceil(static_cast(params.inputs[0].X().v) / static_cast(options.tileDims.x))); - const int numTilesY = static_cast( - std::ceil(static_cast(params.inputs[0].Y().v) / static_cast(options.tileDims.y))); - - dispatchData.cldnnStyle.blockWidth = options.tileDims.x; - dispatchData.cldnnStyle.blockHeight = options.tileDims.y; - dispatchData.gws[0] = numTilesX * simdSize; - dispatchData.gws[1] = numTilesY; - dispatchData.gws[2] = params.inputs[0].Feature().v * params.inputs[0].Batch().v; - dispatchData.lws[0] = simdSize; - dispatchData.lws[1] = 1; - dispatchData.lws[2] = 1; - - dispatchData.efficiency = FORCE_PRIORITY_5; - - return dispatchData; -} - -JitConstants ConvolutionKernel_bfyx_3x3_dw_opt::GetJitConstants(const convolution_params& params, - const DispatchData& dispatchData) const { - stSize tileDims = {dispatchData.cldnnStyle.blockWidth, dispatchData.cldnnStyle.blockHeight}; - auto mem_consts = ConvolutionKernelBase::GetJitConstants(params, dispatchData); - - if (tileDims.y != 0 && tileDims.x != 0) { - mem_consts.AddConstant(MakeJitConstant("UNIT_BYTE_SIZE", BytesPerElement(params.output.GetDType()))); - mem_consts.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0])); - mem_consts.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y)); - mem_consts.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x)); - } - - return mem_consts; -} - -KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetTunedKernelsDataByIndex(const Params& params, - const optional_params& options, - const int autoTuneIndex) const { - constexpr int simdSize = 16; - - KernelData kd = KernelData::Default(params); - convolution_params& convParams = *static_cast(kd.params.get()); - DispatchData dispatchData = SetDefault(convParams, autoTuneIndex); - - if (static_cast(static_cast(dispatchData.gws[0] - 1) / simdSize) * dispatchData.cldnnStyle.blockWidth + simdSize > - convParams.inputs[0].Y().pitch) { - // Internal Error - requested tile size is not supported for y pitch - return {}; - } - - return GetCommonKernelsData(params, options, GetAutoTuneOptions(params, autoTuneIndex).exeMode, autoTuneIndex); -} - -KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options, -1); -} - -KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetKernelsDataForAutoTune(const Params& params, - const optional_params& options) const { - if (!Validate(params, options)) { - return {}; - } - - KernelsData res = {}; - - for (size_t i = 0; i < autoTuneOptions.size(); i++) { - KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast(i)); - if (!kd.empty()) { - res.emplace_back(kd[0]); - } - } - - KernelsData defaultKds = GetKernelsData(params, options); - res.insert(res.end(), defaultKds.begin(), defaultKds.end()); - - return res; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.h deleted file mode 100644 index c8e52854e70..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_3x3_dw_opt.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include -#include - -namespace kernel_selector { -class ConvolutionKernel_bfyx_3x3_dw_opt : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_bfyx_3x3_dw_opt(); - virtual ~ConvolutionKernel_bfyx_3x3_dw_opt() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; - KernelsData GetTunedKernelsDataByIndex(const Params& params, - const optional_params& options, - int autoTuneIndex) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params&, const optional_params&) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::oiyx; - } - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override; - DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override; - - struct AutoTuneOption { - stSize tileDims; - std::string exeMode; - }; - - AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; - std::vector autoTuneOptions = {}; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp deleted file mode 100644 index f515fa30f20..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h" -#include -#include -#include - -namespace kernel_selector { -// Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel. -constexpr size_t sub_group_size = 16; - -ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::ConvolutionKernel_bfyx_os_iyx_osv16_2_sg() - : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16_2_sg") { - // Generate the dispatch options to the auto-tuner. - std::vector blockWidthSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14, 16}; - std::vector blockHeightSizes = {1, 2, 3, 4, 5}; - std::vector prefetchSizes = {1, 2, 3, 4, 5, 6, 8, 10}; - std::vector executionModes = ConvolutionKernelBase::autoTuneOptions; - const size_t maxBlockSize = 60; - - for (auto executionMode : executionModes) { - for (auto blockWidth : blockWidthSizes) { - for (auto blockHeight : blockHeightSizes) { - for (auto prefetch : prefetchSizes) { - if (blockWidth * blockHeight <= maxBlockSize) { - autoTuneOptions.emplace_back(AutoTuneOption{blockWidth, blockHeight, prefetch, executionMode}); - } - } - } - } - } -} - -ParamsKey ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F16); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableSubGroup(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableDilation(); - return k; -} - -static std::pair get_bfyx_req_input_block_dims(size_t output_block_width, - size_t output_block_height, - const uSize& filter_size, - const uSize& stride, - const uSize& dilation, - size_t sg_size = 16, - size_t read_chunk_size = 8, - size_t min_read_size = 16) { - assert(output_block_width > 0 && output_block_height > 0); - assert(stride.x > 0 && stride.y > 0); - assert(filter_size.x > 0 && filter_size.y > 0); - - // Number of elements in X dimension needed from input to compute output block without re-reading input. - size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1; - // Number of elements in Y dimension needed from input to compute output block without re-reading input. - size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1; - - // Required number of elements in X dimension rounded to nearest >= read chunk size. - size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size); - // Number of sub-group-sized vectors of unit type needed to store input block. - size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size); - - return std::make_pair(input_block_array_size, input_block_read_width); -} - -static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) { - // how many elements we will compute in each dimension - size_t computed_x = Align(output_x, block_x); - size_t computed_y = Align(output_y, block_y); - // how many simds we need in each dimension - size_t simds_x = computed_x / block_x; - size_t simds_y = computed_y / block_y; - // how many unused values we have in each dimension - size_t unused_x = computed_x - output_x; - size_t unused_y = computed_y - output_y; - - block_x -= unused_x / simds_x; - block_y -= unused_y / simds_y; -} - -ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetAutoTuneOptions( - const Params& p, - int autoTuneIndex) const { - if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast(autoTuneOptions.size()))) { - return autoTuneOptions[autoTuneIndex]; - } - - AutoTuneOption option = {0, 0, 0, DEFAULT}; - - const convolution_params& cp = static_cast(p); - - if (cp.stride.x == 1 && cp.stride.y == 1) { - if (cp.filterSize.x == 1 && cp.filterSize.y == 1) { - option.blockWidth = 16; - option.blockHeight = 1; - option.prefetch = 4; - // if less than 16 values is required to compute one single row of output - // then each WI shall compute one single row to maximize reuse within SIMD subgroup (this gives very nice - // performance results) - } else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size) { - option.blockWidth = cp.output.X().v; - option.blockHeight = 1; - option.prefetch = 4; - } else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) { - option.blockWidth = sub_group_size - cp.filterSize.x + 1; - option.blockHeight = 2; - option.prefetch = 4; - } else { - option.blockWidth = 4; - option.blockHeight = 3; - option.prefetch = 4; - } - } else if (cp.stride.x == 2 && cp.stride.y == 2) { - option.blockWidth = 5; - option.blockHeight = 4; - option.prefetch = 4; - } else { - option.blockWidth = 4; - option.blockHeight = 3; - option.prefetch = 5; - } - - // if this is not 1x1 batch1 case then shrink filters, other way we're memory bound and it's best to use 16x1 block - // sizes - if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) { - shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight); - } - - return option; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::SetDefault(const convolution_params& cp, - int autoTuneIndex) const { - DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp); - - const auto of_maps = cp.output.Feature().v; - const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - - dispatchData.efficiency = FORCE_PRIORITY_3; - - auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex); - dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth; - dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight; - dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch; - - auto input_block_dims = get_bfyx_req_input_block_dims(dispatchData.cldnnStyle.blockWidth, - dispatchData.cldnnStyle.blockHeight, - cp.filterSize, - cp.stride, - cp.dilation, - sub_group_size, - cp.output.GetDType() == Datatype::F16 ? sub_group_size : sub_group_size / 2, - sub_group_size); - dispatchData.cldnnStyle.inputBlockArraySize = input_block_dims.first; - dispatchData.cldnnStyle.inputBlockWidth = input_block_dims.second; - - dispatchData.gws[0] = CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth); - dispatchData.gws[1] = CeilDiv(cp.output.Y().v, dispatchData.cldnnStyle.blockHeight); - dispatchData.gws[2] = 2 * of_threads_per_batch * cp.output.Batch().v; - - dispatchData.lws[0] = 1; - dispatchData.lws[1] = 1; - dispatchData.lws[2] = 2 * sub_group_size; - - return dispatchData; -} - -bool ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - if (cp.inputs[0].Feature().v % 2 != 0 || cp.inputs[0].Feature().v < 64) - return false; - - if (cp.output.Feature().v % 64 != 0) - return false; - - return true; -} - -JitConstants ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetJitConstants(const convolution_params& params, - const DispatchData& dispatchData) const { - const auto of_maps = params.output.Feature().v; - const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - size_t leftovers = of_threads_per_batch - of_maps; - - auto jit = Parent::GetJitConstants(params, dispatchData); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 16)); - jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth)); - jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight)); - jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", dispatchData.cldnnStyle.inputBlockArraySize)); - jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth)); - jit.AddConstant(MakeJitConstant("PREFETCH", dispatchData.cldnnStyle.prefetch)); - - if (leftovers) { - jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers)); - } - - return jit; -} - -WeightsLayout ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetPreferredWeightsLayout( - const convolution_params ¶ms) const { - return params.groups == 1 ? WeightsLayout::os_iyx_osv16 : WeightsLayout::g_os_iyx_osv16; -} - -KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} - -KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsDataForAutoTune(const Params& params, - const optional_params& options) const { - if (!Validate(params, options)) { - return {}; - } - - KernelsData res = {}; - - for (size_t i = 0; i < autoTuneOptions.size(); i++) { - KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast(i)); - if (!kd.empty()) { - res.emplace_back(kd[0]); - } - } - - return res; -} - -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.h deleted file mode 100644 index 75e8c3bdec6..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16_2_sg.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include -#include - -namespace kernel_selector { - -class ConvolutionKernel_bfyx_os_iyx_osv16_2_sg : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_bfyx_os_iyx_osv16_2_sg(); - virtual ~ConvolutionKernel_bfyx_os_iyx_osv16_2_sg() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override; - bool Validate(const Params& p, const optional_params& o) const override; - bool NeedPaddedInput() const override { return true; } - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - -private: - struct AutoTuneOption { - size_t blockWidth; - size_t blockHeight; - size_t prefetch; - std::string exeMode; - }; - - AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; - - std::vector autoTuneOptions = {}; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp index e002c9f0958..28f31b2c3dc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp @@ -20,14 +20,11 @@ #include "convolution_kernel_bfyx_gemm_like.h" #include "convolution_kernel_bfyx_direct_10_12_16.h" #include "convolution_kernel_bfyx_os_iyx_osv16.h" -#include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h" #include "convolution_kernel_bfyx_iyxo.h" #include "convolution_kernel_yxfb_ref.h" #include "convolution_kernel_yxfb_yxio_b16.h" #include "convolution_kernel_yxfb_yxio_b8.h" -#include "convolution_kernel_yxfb_yxio_b1_block.h" #include "convolution_kernel_yxfb_yxio_b1_block_multiple_x.h" -// #include "convolution_kernel_bfyx_3x3_dw_opt.h" #include "convolution_kernel_winograd_2x3_s1.h" #include "convolution_kernel_bfyx_1x1.h" #include "convolution_kernel_bfyx_1x1_gemm_buf.h" @@ -99,16 +96,12 @@ convolution_kernel_selector::convolution_kernel_selector() { Attach(); Attach(); Attach(); - // commented out to not get in our way, will enable in future after autotuning - // Attach(); // yxfb fp Attach(); Attach(); Attach(); Attach(); - // Attach(); // TODO: need to finish integration - // Attach(); // Winograd Attach(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.cpp deleted file mode 100644 index 39f42b033a2..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "convolution_kernel_yxfb_yxio_b1_block.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_yxfb_yxio_b1_block::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F16); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableDilation(); - k.EnableSubGroup(); - return k; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b1_block::SetDefault(const convolution_params& arg, - int) const { - DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg); - // TODO: fill the proper data here (I don't know where can I locate it). - return dispatchData; -} - -JitConstants ConvolutionKernel_yxfb_yxio_b1_block::GetJitConstants(const convolution_params& params, - const DispatchData& dispatchData) const { - auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData); - - cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", dispatchData.lws[0])); - return cldnn_jit; -} - -KernelsData ConvolutionKernel_yxfb_yxio_b1_block::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.h deleted file mode 100644 index e7b1aa9be35..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_yxfb_yxio_b1_block.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_yxfb_yxio_b1_block : public ConvolutionKernelBase { -public: - ConvolutionKernel_yxfb_yxio_b1_block() : ConvolutionKernelBase("convolution_gpu_yxfb_yxio_b1_block_fp32") {} - virtual ~ConvolutionKernel_yxfb_yxio_b1_block() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::yxio; - } - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_3x3_dw_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_3x3_dw_opt.cl deleted file mode 100644 index df81266254a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_3x3_dw_opt.cl +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/include_all.cl" - -#if FP16_UNIT_USED == 0 - #define ALIGNED_BLOCK_READ(ptr, offset) as_float(intel_sub_group_block_read((const __global uint*)(ptr) + (offset))) -#endif - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1))) -KERNEL(convolution_gpu_bfyx_3x3_dw_opt)( - __global UNIT_TYPE* input, - __global UNIT_TYPE* output, - __global UNIT_TYPE* weights, -#if BIAS_TERM - __global UNIT_TYPE* biases, -#endif - uint split_idx) -{ - const uint local_id = get_local_id(0); - const uint tile_x = (uint)get_global_id(0); - const uint tile_y = (uint)get_global_id(1); - const uint bf = (uint)get_global_id(2); - const uint f = bf % INPUT0_FEATURE_NUM; - const uint b = bf / INPUT0_FEATURE_NUM; - - const uint start_x = tile_x / SUB_GROUP_SIZE * TILE_WIDTH; - const uint offset_x = start_x + (tile_x - tile_x / SUB_GROUP_SIZE * SUB_GROUP_SIZE) % TILE_WIDTH; - const uint offset = b * INPUT0_BATCH_PITCH + INPUT0_FEATURE_PITCH * f; - const uint out_offset = b * OUTPUT_BATCH_PITCH + OUTPUT_FEATURE_PITCH * f; - - const int start_y = tile_y * TILE_HEIGHT; - const int end_y = min(INPUT0_SIZE_Y - 1, start_y + TILE_HEIGHT - 1); - const uint weight_offset = f * FILTER_IFM_PITCH + local_id; - - // Read 3 lines of SUB_GROUP_SIZE floats. - // The 3 lines start one float before the current (to the left) and one line up: - // SUB_GROUP_SIZE=16 - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - // 0 X 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - // In the diagram above X represents the current work item. - - const int input_offset_const = INPUT0_OFFSET + offset + (start_y * INPUT0_Y_PITCH + start_x) - 1; - - const uint base_addr_offset = INPUT0_Y_PITCH; - - UNIT_TYPE input_buffer[3] = { UNIT_VAL_ZERO }; - const int base_offset = -base_addr_offset * UNIT_BYTE_SIZE; - -#if FP16_UNIT_USED - const uint lid = get_sub_group_local_id(); - if(input_offset_const - base_addr_offset >= 0) - input_buffer[0] = input[input_offset_const - base_addr_offset + lid]; - if(input_offset_const >= 0) - input_buffer[1] = input[input_offset_const + lid]; -#else - input_buffer[0] = ALIGNED_BLOCK_READ(input, input_offset_const - base_addr_offset); - input_buffer[1] = ALIGNED_BLOCK_READ(input, input_offset_const); -#endif - - UNIT_TYPE w = weights[weight_offset]; - - int first = 0; - int second = 1; - int third = 2; - int input_offset = input_offset_const; - - for (int y = start_y; y <= end_y; y++) - { - UNIT_TYPE res = UNIT_VAL_ZERO; - input_offset += base_addr_offset; - -#if FP16_UNIT_USED - if(input_offset >= 0) - input_buffer[third] = input[input_offset + lid]; -#else - input_buffer[third] = ALIGNED_BLOCK_READ(input, input_offset); -#endif - - uint kc = 0; - LOOP(FILTER_SIZE_X, kc, - { - res = mad(intel_sub_group_shuffle( w, FILTER_SIZE_Y + kc),intel_sub_group_shuffle( input_buffer[second], local_id + kc),res); - - if (y == 0) - { - res = mad(intel_sub_group_shuffle( w, 2*FILTER_SIZE_Y + kc),intel_sub_group_shuffle( input_buffer[third], local_id + kc),res); - } - else if (y == INPUT0_SIZE_Y - 1) - { - res = mad(intel_sub_group_shuffle( w, kc),intel_sub_group_shuffle( input_buffer[first], local_id + kc),res); - } - else - { - res = mad(intel_sub_group_shuffle( w, kc),intel_sub_group_shuffle( input_buffer[first], local_id + kc),res); - res = mad(intel_sub_group_shuffle( w, 2*FILTER_SIZE_Y + kc),intel_sub_group_shuffle( input_buffer[third], local_id + kc),res); - } - }); - -#if BIAS_TERM - res += biases[f]; -#endif - - if ((local_id < TILE_WIDTH) && (offset_x < INPUT0_SIZE_X)) - { - output[OUTPUT_OFFSET + out_offset + y * INPUT0_SIZE_X + offset_x] = ACTIVATION(res, ACTIVATION_PARAMS); - } - - first = (first + 1) % 3; - second = (second + 1) % 3; - third = (third + 1) % 3; - } - -} - -#undef ALIGNED_BLOCK_READ diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl deleted file mode 100644 index 90251f078a5..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16_2_sg.cl +++ /dev/null @@ -1,254 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/common.cl" -#include "include/data_types.cl" - -#define SIMD_SIZE SUB_GROUP_SIZE -// --------------------------------------------------------------------------------------------------------------------- -// Just-in-time macro definitions: -// --------------------------------------------------------------------------------------------------------------------- - -// Required JIT constants: -// - INPUT - [tensor] Input dimensions (batch, spatial and feature). -// - OUTPUT - [tensor] Output dimensions (batch, spatial and feature). -// - STRIDE - [tensor] Stride (only spatial). Factors that describe step size in X or Y dimension of -// input position of application of convolution filter when next ouput value -// (step 1 in in X or Y dimension of output) is computed. -// - INPUT0_OFFSET - [tensor] Offset for the first element -// initial offset input position of application of convolution filter and output position. -// - FP16_SUPPORTED - [0/1] Value indicating whether device supports FP16 OpenCL extension (cl_khr_fp16). -// - FP16_UNIT_USED - [0/1] Value indicating that current kernel should use FP16. -// - UNIT_TYPE - Type of unit of input/output/weight/bias. -// - UNIT_VAL_ZERO - Literal of current UNIT_TYPE that represents 0. -// - RELU - [0/1] Indicates that ReLU activation function should be used on output. -// - NEGATIVE_SLOPE - [float] Factor for negative output values (required when ReLU is specified). -// -// - SUB_GROUP_SIZE - [int] Size of used subgroup (SIMD). -// - LEFTOVERS - [int] Optional parameter, required only when number of ofm is not dividable by SUB_GROUP_SIZE -// see comment for FEATURES_THREADS_PER_BATCH for more informations - -/* -gpu::make_jit_constant("OUTPUT_LIMIT", output_size), -gpu::make_jit_constant("FILTER", filter_mem.argument().size), -gpu::make_jit_constant("FILTER_ARRAY_NUM", split), -gpu::make_jit_constant("OUTPUT_BLOCK_WIDTH", _kernel_data.block_width)); -gpu::make_jit_constant("OUTPUT_BLOCK_HEIGHT", _kernel_data.block_height)); -gpu::make_jit_constant("IN_BLOCK_ARRAY_SIZE", _kernel_data.input_block_array_size)); -gpu::make_jit_constant("IN_BLOCK_WIDTH", _kernel_data.input_block_width)); -gpu::make_jit_constant("PREFETCH", _kernel_data.prefetch)); -if (_kernel_data.leftovers) - gpu::make_jit_constant("LEFTOVERS", _kernel_data.leftovers)); -*/ - -// FEATURES_THREADS_PER_BATCH defines how many threads in z-dimension are processing single batch. -// ideally, z-dimension of value n should indicate processing of n-th output feature. however, since -// threads are stack in groups of SUB_GROUP_SIZE, when number of ofm is not dividable by SUB_GROUP_SIZE -// there are dummy threads added in z-dimension in count of LEFTOVERS. We need to take them into consideration -// while calculating batch's id (see lines 86-87). Values calculated by dummy threads are discarded at line 210. -#ifdef LEFTOVERS -#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM + LEFTOVERS) -#else -#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM) -#endif - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -__attribute__((reqd_work_group_size(1, 1, 2*SUB_GROUP_SIZE))) -KERNEL(convolution_gpu_bfyx_os_iyx_osv16_2_sg)( - const __global UNIT_TYPE* input, - __global UNIT_TYPE* output, - const __global UNIT_TYPE* weights, -#if BIAS_TERM - const __global UNIT_TYPE* bias, -#endif - uint split_idx) // TODO: removing this parameter cause a performance degradation... :) -{ - const uint oc = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH; // oc = Output Column - const uint or = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT; // or = Output Row - const uint fm = (uint)get_group_id(2) * SUB_GROUP_SIZE + get_sub_group_local_id();//get_global_id(2); // fm = Feature Map = od = Output Depth - const uint lid = get_sub_group_local_id(); - - const uint ifm_part = get_sub_group_id(); - __local float slm_vals[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT * SIMD_SIZE]; - - uint batch_idx = fm / FEATURES_THREADS_PER_BATCH; - uint feature_idx = fm % FEATURES_THREADS_PER_BATCH; - uint fmg = feature_idx / SUB_GROUP_SIZE; - - UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE]; - UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT]; - UNIT_TYPE w[PREFETCH]; - uint in_addr; - uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid; - weight_addr += ifm_part * SUB_GROUP_SIZE * FILTER_IFM_NUM/2 * FILTER_SIZE_X * FILTER_SIZE_Y; - - for(int i = 0; i < (OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT); i++) { - out[i] = UNIT_VAL_ZERO; - } - - uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; - in_addr = batch_idx * INPUT0_BATCH_PITCH; - in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + or * STRIDE_SIZE_Y * INPUT0_Y_PITCH + oc * STRIDE_SIZE_X + lid; - in_addr += ifm_part * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM/2; - - for(int kd = 0; kd < FILTER_IFM_NUM/2; kd++) // _ID = 3, RGB - { - uint tmp_in_addr = in_addr; - -#if IN_BLOCK_WIDTH % SUB_GROUP_SIZE == 0 - __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE))) - for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) { - // Horizontal position in input block after read. - const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE; - - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; - - // If we have row break, move to the next row. - if (in_block_next_x_pos == IN_BLOCK_WIDTH) - tmp_in_addr += INPUT0_Y_PITCH; - } -#elif (2 * IN_BLOCK_WIDTH) % SUB_GROUP_SIZE == 0 - __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE))) - for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) { - // Horizontal position in input block after read. - const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE; - - if (in_block_next_x_pos <= IN_BLOCK_WIDTH) { // - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; - - // If we have row break, move to the next row. - if (in_block_next_x_pos == IN_BLOCK_WIDTH) - tmp_in_addr += INPUT0_Y_PITCH; - } - else { - // TODO: Generalize this step to relax IN_BLOCK_WIDTH restrictions. - // Position in sub-group on which new row need to be read. - const uint sg_br_pos = IN_BLOCK_WIDTH - in_block_pos % IN_BLOCK_WIDTH; - - if (lid < sg_br_pos) - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH]; - // We have row break inside sub-group. Need to move to next line. - tmp_in_addr += INPUT0_Y_PITCH; - if (lid >= sg_br_pos) - in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - sg_br_pos]; - - // If we have another row break, move to the next row. - if (in_block_next_x_pos == 2 * IN_BLOCK_WIDTH) - tmp_in_addr += INPUT0_Y_PITCH; - } - } -#else - #error IN_BLOCK_WIDTH must be multiple of SUB_GROUP_SIZE or half of SUB_GROUP_SIZE. Other scenarios are not currently implemented. -#endif - - //move to next filter - in_addr += INPUT0_FEATURE_PITCH; - - for(int pf=0; pf= OUTPUT_SIZE_Y)) - { - for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) { - // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer. - if(!(oc + c >= OUTPUT_SIZE_X)) - output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c]; - } - } - } - -} - -} - -#undef FEATURES_THREADS_PER_BATCH diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl deleted file mode 100644 index 3e67dead386..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_yxfb_yxio_b1_block_fp32.cl +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" -#include "include/sub_group.cl" - -__attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1))) -KERNEL(convolution_gpu_yxfb_yxio_b1_block)( - const __global float* input, - __global float* output, - const __global float* filter, -#if BIAS_TERM - const __global float* bias, -#endif - uint split_idx) -{ -#ifdef USE_VECTOR_8 - #define VECTOR_FLOAT float8 - #define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN)) - #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA)); -#endif -#ifdef USE_VECTOR_4 - #define VECTOR_FLOAT float4 - #define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN)) - #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA)); -#endif -#ifdef USE_VECTOR_2 - #define VECTOR_FLOAT float2 - #define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN)) - #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA)); -#endif - - const uint batch_num = INPUT0_BATCH_NUM; - const uint linear_id_xy = (uint)get_group_id(1) + (uint)get_global_size(1) * (uint)get_group_id(2); - uint global_id = (((uint)get_group_id(0) * LOCAL_WORK_GROUP_SIZE) / batch_num) * batch_num + (linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * batch_num; - - const uint out_batch_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM; - const uint out_x = get_group_id(1); - const uint out_y = get_group_id(2); - - const uint out_id = (global_id / batch_num) * OFM_PER_WORK_ITEM * batch_num + out_batch_id; - - const uint ofm_offset = (global_id * (OFM_PER_WORK_ITEM / batch_num)) % FILTER_OFM_NUM; - - const uint sub_group_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM; - - VECTOR_FLOAT _data0 = 0.f; - - const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - for (uint i = 0; i < FILTER_SIZE_Y; i++) - { - const int input_offset_y = y + i * DILATION_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - - if(!zero_y) - { - for (uint j = 0; j < FILTER_SIZE_X; j++) - { - const int input_offset_x = x + j * DILATION_SIZE_X; - const bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - - if(!zero) - { - uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH; - input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH; - input_idx += out_batch_id; - - uint filter_idx = ofm_offset + sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH; - -#if INPUT0_BATCH_NUM == 1 - for(uint h = 0; h < FILTER_IFM_NUM / 8; h++) - { - float _in = as_float(intel_sub_group_block_read((const __global uint*)input + input_idx)); - float8 _input = TRANSPOSE_BLOCK_8(_in); - - VECTOR_FLOAT _filter; - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s0, _filter, _data0); - - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s1, _filter, _data0); - - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s2, _filter, _data0); - - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s3, _filter, _data0); - - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s4, _filter, _data0); - - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s5, _filter, _data0); - - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s6, _filter, _data0); - - _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM; - _data0 = mad(_input.s7, _filter, _data0); - - input_idx += 8 * INPUT0_FEATURE_PITCH; - } - for (uint h = FILTER_IFM_NUM - (FILTER_IFM_NUM % 8); h < FILTER_IFM_NUM; h++) -#else - for (uint h = 0; h < FILTER_IFM_NUM; h++) -#endif - { - VECTOR_FLOAT _filter = BLOCK_READ(filter + filter_idx); - _data0 = mad(input[input_idx], _filter, _data0); - filter_idx += FILTER_IFM_PITCH; - input_idx += INPUT0_FEATURE_PITCH; - } - } - } - } - } - -#if BIAS_TERM - _data0 += BLOCK_READ(bias + ofm_offset); -#endif - _data0 = ACTIVATION(_data0, ACTIVATION_PARAMS); - - uint _out_id = OUTPUT_OFFSET + out_id; - BLOCK_WRITE(output + _out_id, _data0); -#if defined(USE_VECTOR_8) || defined(USE_VECTOR_4) || defined(USE_VECTOR_2) - #undef VECTOR_FLOAT - #undef BLOCK_READ - #undef BLOCK_WRITE -#endif -}