[IE CLDNN] Removed unused conv kernels (#2995)
This commit is contained in:
parent 32c48b1087
commit ccee30e733
@@ -1,170 +0,0 @@
// Copyright (c) 2017-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "convolution_kernel_bfyx_3x3_dw_opt.h"
#include <vector>

namespace kernel_selector {

ConvolutionKernel_bfyx_3x3_dw_opt::ConvolutionKernel_bfyx_3x3_dw_opt()
    : ConvolutionKernelBase("convolution_gpu_bfyx_3x3_dw_opt") {
    // Generate the dispatch options to the auto-tuner.
    std::vector<size_t> tileXDimSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14};
    std::vector<size_t> tileYDimSizes = {1, 2, 3, 4, 5, 6, 7};
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;

    for (auto tileXDim : tileXDimSizes) {
        for (auto tileYDim : tileYDimSizes) {
            for (auto executionMode : executionModes) {
                autoTuneOptions.emplace_back(AutoTuneOption{{tileXDim, tileYDim}, executionMode});
            }
        }
    }
}

ParamsKey ConvolutionKernel_bfyx_3x3_dw_opt::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F32);
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableSplitSupport();
    k.EnableSubGroup();
    k.EnableSubGroupShort();
    k.EnableDepthwiseSeparableOpt();
    return k;
}

bool ConvolutionKernel_bfyx_3x3_dw_opt::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }

    const convolution_params& cp = static_cast<const convolution_params&>(p);

    if ((cp.filterSize.x != 3) || (cp.filterSize.y != 3) || (cp.stride.x != 1) || (cp.stride.y != 1) ||
        (cp.padding.x != 1) || (cp.padding.y != 1) || (cp.inputs[0].Feature().v != cp.split) ||
        cp.output.PitchesDifferFromLogicalDims()) {
        return false;
    }

    return true;
}

ConvolutionKernel_bfyx_3x3_dw_opt::AutoTuneOption ConvolutionKernel_bfyx_3x3_dw_opt::GetAutoTuneOptions(const Params&,
                                                                                                        int autoTuneIndex) const {
    if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
        return autoTuneOptions[autoTuneIndex];
    }

    constexpr int simdSize = 16;

    return AutoTuneOption{{simdSize - 2, 7}, DEFAULT};
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_3x3_dw_opt::SetDefault(const convolution_params& params,
                                                                                  int autoTuneIndex) const {
    constexpr int simdSize = 16;

    DispatchData dispatchData = Parent::SetDefault(params);

    auto options = GetAutoTuneOptions(params, autoTuneIndex);

    const int numTilesX = static_cast<int>(
        std::ceil(static_cast<float>(params.inputs[0].X().v) / static_cast<float>(options.tileDims.x)));
    const int numTilesY = static_cast<int>(
        std::ceil(static_cast<float>(params.inputs[0].Y().v) / static_cast<float>(options.tileDims.y)));

    dispatchData.cldnnStyle.blockWidth = options.tileDims.x;
    dispatchData.cldnnStyle.blockHeight = options.tileDims.y;
    dispatchData.gws[0] = numTilesX * simdSize;
    dispatchData.gws[1] = numTilesY;
    dispatchData.gws[2] = params.inputs[0].Feature().v * params.inputs[0].Batch().v;
    dispatchData.lws[0] = simdSize;
    dispatchData.lws[1] = 1;
    dispatchData.lws[2] = 1;

    dispatchData.efficiency = FORCE_PRIORITY_5;

    return dispatchData;
}

JitConstants ConvolutionKernel_bfyx_3x3_dw_opt::GetJitConstants(const convolution_params& params,
                                                                const DispatchData& dispatchData) const {
    stSize tileDims = {dispatchData.cldnnStyle.blockWidth, dispatchData.cldnnStyle.blockHeight};
    auto mem_consts = ConvolutionKernelBase::GetJitConstants(params, dispatchData);

    if (tileDims.y != 0 && tileDims.x != 0) {
        mem_consts.AddConstant(MakeJitConstant("UNIT_BYTE_SIZE", BytesPerElement(params.output.GetDType())));
        mem_consts.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", dispatchData.lws[0]));
        mem_consts.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y));
        mem_consts.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x));
    }

    return mem_consts;
}

KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetTunedKernelsDataByIndex(const Params& params,
                                                                          const optional_params& options,
                                                                          const int autoTuneIndex) const {
    constexpr int simdSize = 16;

    KernelData kd = KernelData::Default<convolution_params>(params);
    convolution_params& convParams = *static_cast<convolution_params*>(kd.params.get());
    DispatchData dispatchData = SetDefault(convParams, autoTuneIndex);

    if (static_cast<int>(static_cast<int>(dispatchData.gws[0] - 1) / simdSize) * dispatchData.cldnnStyle.blockWidth + simdSize >
        convParams.inputs[0].Y().pitch) {
        // Internal Error - requested tile size is not supported for y pitch
        return {};
    }

    return GetCommonKernelsData(params, options, GetAutoTuneOptions(params, autoTuneIndex).exeMode, autoTuneIndex);
}

KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetKernelsData(const Params& params,
                                                              const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options, -1);
}

KernelsData ConvolutionKernel_bfyx_3x3_dw_opt::GetKernelsDataForAutoTune(const Params& params,
                                                                         const optional_params& options) const {
    if (!Validate(params, options)) {
        return {};
    }

    KernelsData res = {};

    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
        if (!kd.empty()) {
            res.emplace_back(kd[0]);
        }
    }

    KernelsData defaultKds = GetKernelsData(params, options);
    res.insert(res.end(), defaultKds.begin(), defaultKds.end());

    return res;
}
}  // namespace kernel_selector
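For reference, the dispatch logic of the removed depthwise kernel is simple to reproduce on the host side: each sub-group of simdSize work items covers one TILE_WIDTH x TILE_HEIGHT output tile, so gws[0] is the X tile count times the SIMD width and gws[1] is the Y tile count. The following is a minimal standalone C++ sketch of that arithmetic, not part of the removed code; the layer sizes are hypothetical and no clDNN types are used.

    #include <cmath>
    #include <cstddef>
    #include <iostream>

    int main() {
        // Hypothetical layer: 28x28 spatial size, default tile of (simd - 2) x 7.
        const std::size_t simdSize = 16, inputX = 28, inputY = 28;
        const std::size_t tileX = simdSize - 2, tileY = 7;

        const std::size_t numTilesX = static_cast<std::size_t>(std::ceil(double(inputX) / tileX));
        const std::size_t numTilesY = static_cast<std::size_t>(std::ceil(double(inputY) / tileY));

        // gws = {numTilesX * simdSize, numTilesY, features * batches}; lws = {simdSize, 1, 1}.
        std::cout << "gws[0]=" << numTilesX * simdSize   // 2 * 16 = 32
                  << " gws[1]=" << numTilesY << '\n';    // 4
    }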
@@ -1,52 +0,0 @@
// Copyright (c) 2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "convolution_kernel_base.h"
#include <string>
#include <vector>

namespace kernel_selector {
class ConvolutionKernel_bfyx_3x3_dw_opt : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_bfyx_3x3_dw_opt();
    virtual ~ConvolutionKernel_bfyx_3x3_dw_opt() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override;
    KernelsData GetTunedKernelsDataByIndex(const Params& params,
                                           const optional_params& options,
                                           int autoTuneIndex) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params&, const optional_params&) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::oiyx;
    }
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;

    struct AutoTuneOption {
        stSize tileDims;
        std::string exeMode;
    };

    AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
    std::vector<AutoTuneOption> autoTuneOptions = {};
};
}  // namespace kernel_selector
@@ -1,262 +0,0 @@
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h"
#include <vector>
#include <utility>
#include <algorithm>

namespace kernel_selector {
// Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel.
constexpr size_t sub_group_size = 16;

ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::ConvolutionKernel_bfyx_os_iyx_osv16_2_sg()
    : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16_2_sg") {
    // Generate the dispatch options to the auto-tuner.
    std::vector<size_t> blockWidthSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14, 16};
    std::vector<size_t> blockHeightSizes = {1, 2, 3, 4, 5};
    std::vector<size_t> prefetchSizes = {1, 2, 3, 4, 5, 6, 8, 10};
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
    const size_t maxBlockSize = 60;

    for (auto executionMode : executionModes) {
        for (auto blockWidth : blockWidthSizes) {
            for (auto blockHeight : blockHeightSizes) {
                for (auto prefetch : prefetchSizes) {
                    if (blockWidth * blockHeight <= maxBlockSize) {
                        autoTuneOptions.emplace_back(AutoTuneOption{blockWidth, blockHeight, prefetch, executionMode});
                    }
                }
            }
        }
    }
}

ParamsKey ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableSubGroup();
    k.EnableBiasPerFeature();
    k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableSplitSupport();
    k.EnableDilation();
    return k;
}

static std::pair<size_t, size_t> get_bfyx_req_input_block_dims(size_t output_block_width,
                                                               size_t output_block_height,
                                                               const uSize& filter_size,
                                                               const uSize& stride,
                                                               const uSize& dilation,
                                                               size_t sg_size = 16,
                                                               size_t read_chunk_size = 8,
                                                               size_t min_read_size = 16) {
    assert(output_block_width > 0 && output_block_height > 0);
    assert(stride.x > 0 && stride.y > 0);
    assert(filter_size.x > 0 && filter_size.y > 0);

    // Number of elements in X dimension needed from input to compute output block without re-reading input.
    size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1;
    // Number of elements in Y dimension needed from input to compute output block without re-reading input.
    size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1;

    // Required number of elements in X dimension rounded to nearest >= read chunk size.
    size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
    // Number of sub-group-sized vectors of unit type needed to store input block.
    size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size);

    return std::make_pair(input_block_array_size, input_block_read_width);
}

static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) {
    // how many elements we will compute in each dimension
    size_t computed_x = Align(output_x, block_x);
    size_t computed_y = Align(output_y, block_y);
    // how many simds we need in each dimension
    size_t simds_x = computed_x / block_x;
    size_t simds_y = computed_y / block_y;
    // how many unused values we have in each dimension
    size_t unused_x = computed_x - output_x;
    size_t unused_y = computed_y - output_y;

    block_x -= unused_x / simds_x;
    block_y -= unused_y / simds_y;
}

ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetAutoTuneOptions(
    const Params& p,
    int autoTuneIndex) const {
    if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
        return autoTuneOptions[autoTuneIndex];
    }

    AutoTuneOption option = {0, 0, 0, DEFAULT};

    const convolution_params& cp = static_cast<const convolution_params&>(p);

    if (cp.stride.x == 1 && cp.stride.y == 1) {
        if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
            option.blockWidth = 16;
            option.blockHeight = 1;
            option.prefetch = 4;
        // if fewer than 16 values are required to compute one single row of output,
        // then each WI shall compute one single row to maximize reuse within the SIMD subgroup
        // (this gives very nice performance results)
        } else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size) {
            option.blockWidth = cp.output.X().v;
            option.blockHeight = 1;
            option.prefetch = 4;
        } else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) {
            option.blockWidth = sub_group_size - cp.filterSize.x + 1;
            option.blockHeight = 2;
            option.prefetch = 4;
        } else {
            option.blockWidth = 4;
            option.blockHeight = 3;
            option.prefetch = 4;
        }
    } else if (cp.stride.x == 2 && cp.stride.y == 2) {
        option.blockWidth = 5;
        option.blockHeight = 4;
        option.prefetch = 4;
    } else {
        option.blockWidth = 4;
        option.blockHeight = 3;
        option.prefetch = 5;
    }

    // if this is not the 1x1, batch-1 case then shrink the blocks to the output size; otherwise we're memory
    // bound and it's best to use 16x1 block sizes
    if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
        shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
    }

    return option;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::SetDefault(const convolution_params& cp,
                                                                                         int autoTuneIndex) const {
    DispatchData dispatchData = ConvolutionKernelBase::SetDefault(cp);

    const auto of_maps = cp.output.Feature().v;
    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);

    dispatchData.efficiency = FORCE_PRIORITY_3;

    auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
    dispatchData.cldnnStyle.blockWidth = tuneOptions.blockWidth;
    dispatchData.cldnnStyle.blockHeight = tuneOptions.blockHeight;
    dispatchData.cldnnStyle.prefetch = tuneOptions.prefetch;

    auto input_block_dims = get_bfyx_req_input_block_dims(dispatchData.cldnnStyle.blockWidth,
                                                          dispatchData.cldnnStyle.blockHeight,
                                                          cp.filterSize,
                                                          cp.stride,
                                                          cp.dilation,
                                                          sub_group_size,
                                                          cp.output.GetDType() == Datatype::F16 ? sub_group_size : sub_group_size / 2,
                                                          sub_group_size);
    dispatchData.cldnnStyle.inputBlockArraySize = input_block_dims.first;
    dispatchData.cldnnStyle.inputBlockWidth = input_block_dims.second;

    dispatchData.gws[0] = CeilDiv(cp.output.X().v, dispatchData.cldnnStyle.blockWidth);
    dispatchData.gws[1] = CeilDiv(cp.output.Y().v, dispatchData.cldnnStyle.blockHeight);
    dispatchData.gws[2] = 2 * of_threads_per_batch * cp.output.Batch().v;

    dispatchData.lws[0] = 1;
    dispatchData.lws[1] = 1;
    dispatchData.lws[2] = 2 * sub_group_size;

    return dispatchData;
}

bool ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }

    const convolution_params& cp = static_cast<const convolution_params&>(p);

    if (cp.inputs[0].Feature().v % 2 != 0 || cp.inputs[0].Feature().v < 64)
        return false;

    if (cp.output.Feature().v % 64 != 0)
        return false;

    return true;
}

JitConstants ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetJitConstants(const convolution_params& params,
                                                                       const DispatchData& dispatchData) const {
    const auto of_maps = params.output.Feature().v;
    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
    size_t leftovers = of_threads_per_batch - of_maps;

    auto jit = Parent::GetJitConstants(params, dispatchData);

    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 16));
    jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", dispatchData.cldnnStyle.blockWidth));
    jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", dispatchData.cldnnStyle.blockHeight));
    jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", dispatchData.cldnnStyle.inputBlockArraySize));
    jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth));
    jit.AddConstant(MakeJitConstant("PREFETCH", dispatchData.cldnnStyle.prefetch));

    if (leftovers) {
        jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
    }

    return jit;
}

WeightsLayout ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetPreferredWeightsLayout(
    const convolution_params &params) const {
    return params.groups == 1 ? WeightsLayout::os_iyx_osv16 : WeightsLayout::g_os_iyx_osv16;
}

KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsData(const Params& params,
                                                                     const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options);
}

KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsDataForAutoTune(const Params& params,
                                                                                const optional_params& options) const {
    if (!Validate(params, options)) {
        return {};
    }

    KernelsData res = {};

    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
        if (!kd.empty()) {
            res.emplace_back(kd[0]);
        }
    }

    return res;
}

}  // namespace kernel_selector
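The input-block sizing above follows the usual sliding-window relation: producing W_out outputs with stride s, filter size k and dilation d needs (W_out - 1)*s + (k - 1)*d + 1 input elements, which is then rounded up to the read-chunk granularity and split into sub-group-sized vectors. Below is a small self-contained C++ sketch of that computation, not the removed code itself; the RoundUp/CeilDiv helpers and the layer sizes are hypothetical stand-ins for the ones used above.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    static std::size_t RoundUp(std::size_t v, std::size_t m) { return ((v + m - 1) / m) * m; }
    static std::size_t CeilDiv(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

    int main() {
        // Hypothetical 3x3 convolution, stride 1, no dilation, output block of 16 x 2.
        const std::size_t out_w = 16, out_h = 2, filter = 3, stride = 1, dilation = 1;
        const std::size_t sg_size = 16, read_chunk = 8, min_read = 16;

        const std::size_t req_w = (out_w - 1) * stride + (filter - 1) * dilation + 1;  // 18
        const std::size_t req_h = (out_h - 1) * stride + (filter - 1) * dilation + 1;  // 4
        const std::size_t read_w = std::max(RoundUp(req_w, read_chunk), min_read);     // 24
        const std::size_t array_size = CeilDiv(req_h * read_w, sg_size);               // 6

        std::cout << "input block: " << read_w << " wide, "
                  << array_size << " sub-group vectors\n";
    }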
@@ -1,53 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "convolution_kernel_base.h"
#include <string>
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_bfyx_os_iyx_osv16_2_sg : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_bfyx_os_iyx_osv16_2_sg();
    virtual ~ConvolutionKernel_bfyx_os_iyx_osv16_2_sg() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
    bool Validate(const Params& p, const optional_params& o) const override;
    bool NeedPaddedInput() const override { return true; }
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;

private:
    struct AutoTuneOption {
        size_t blockWidth;
        size_t blockHeight;
        size_t prefetch;
        std::string exeMode;
    };

    AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;

    std::vector<AutoTuneOption> autoTuneOptions = {};
};
}  // namespace kernel_selector
@@ -20,14 +20,11 @@
 #include "convolution_kernel_bfyx_gemm_like.h"
 #include "convolution_kernel_bfyx_direct_10_12_16.h"
 #include "convolution_kernel_bfyx_os_iyx_osv16.h"
-#include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h"
 #include "convolution_kernel_bfyx_iyxo.h"
 #include "convolution_kernel_yxfb_ref.h"
 #include "convolution_kernel_yxfb_yxio_b16.h"
 #include "convolution_kernel_yxfb_yxio_b8.h"
-#include "convolution_kernel_yxfb_yxio_b1_block.h"
 #include "convolution_kernel_yxfb_yxio_b1_block_multiple_x.h"
-// #include "convolution_kernel_bfyx_3x3_dw_opt.h"
 #include "convolution_kernel_winograd_2x3_s1.h"
 #include "convolution_kernel_bfyx_1x1.h"
 #include "convolution_kernel_bfyx_1x1_gemm_buf.h"
@@ -99,16 +96,12 @@ convolution_kernel_selector::convolution_kernel_selector() {
     Attach<ConvolutionKernel_bfyx_1x1>();
     Attach<ConvolutionKernel_bfyx_1x1_gemm_buf>();
     Attach<ConvolutionKernel_bfyx_depthwise_weights_lwg>();
-    // commented out to not get in our way, will enable in future after autotuning
-    // Attach<ConvolutionKernel_bfyx_os_iyx_osv16_2_sg>();

     // yxfb fp
     Attach<ConvolutionKernel_yxfb_Ref>();
     Attach<ConvolutionKernel_yxfb_yxio_b16>();
     Attach<ConvolutionKernel_yxfb_yxio_b8>();
     Attach<ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x>();
-    // Attach<ConvolutionKernel_yxfb_yxio_b1_block>(); // TODO: need to finish integration
-    // Attach<ConvolutionKernel_bfyx_3x3_dw_opt>();

     // Winograd
     Attach<ConvolutionKernel_Winograd_2x3_s1>();
@@ -1,58 +0,0 @@
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "convolution_kernel_yxfb_yxio_b1_block.h"

namespace kernel_selector {

ParamsKey ConvolutionKernel_yxfb_yxio_b1_block::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F32);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputLayout(DataLayout::yxfb);
    k.EnableOutputLayout(DataLayout::yxfb);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableSplitSupport();
    k.EnableDilation();
    k.EnableSubGroup();
    return k;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b1_block::SetDefault(const convolution_params& arg,
                                                                                     int) const {
    DispatchData dispatchData = ConvolutionKernelBase::SetDefault(arg);
    // TODO: fill the proper data here (I don't know where I can locate it).
    return dispatchData;
}

JitConstants ConvolutionKernel_yxfb_yxio_b1_block::GetJitConstants(const convolution_params& params,
                                                                   const DispatchData& dispatchData) const {
    auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, dispatchData);

    cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", dispatchData.lws[0]));
    return cldnn_jit;
}

KernelsData ConvolutionKernel_yxfb_yxio_b1_block::GetKernelsData(const Params& params,
                                                                 const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options);
}
}  // namespace kernel_selector
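The only kernel-specific JIT constant here is LOCAL_WORK_GROUP_SIZE, which ends up as a preprocessor define when the OpenCL source is compiled. The following is a rough, hypothetical C++ illustration of turning such name/value pairs into compiler build options; it is not the actual clDNN JitConstants implementation, just the general idea.

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    int main() {
        // Hypothetical name -> value map standing in for a JitConstants collection.
        std::map<std::string, std::size_t> jit = {{"LOCAL_WORK_GROUP_SIZE", 16}, {"FILTER_SIZE_X", 3}};

        std::ostringstream options;
        for (const auto& kv : jit)
            options << "-D" << kv.first << "=" << kv.second << " ";  // e.g. "-DLOCAL_WORK_GROUP_SIZE=16"

        std::cout << options.str() << '\n';  // a string like this is passed to the OpenCL program build
    }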
@@ -1,38 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_yxfb_yxio_b1_block : public ConvolutionKernelBase {
public:
    ConvolutionKernel_yxfb_yxio_b1_block() : ConvolutionKernelBase("convolution_gpu_yxfb_yxio_b1_block_fp32") {}
    virtual ~ConvolutionKernel_yxfb_yxio_b1_block() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::yxio;
    }
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
};
}  // namespace kernel_selector
@@ -1,130 +0,0 @@
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "include/include_all.cl"

#if FP16_UNIT_USED == 0
#define ALIGNED_BLOCK_READ(ptr, offset) as_float(intel_sub_group_block_read((const __global uint*)(ptr) + (offset)))
#endif

__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(convolution_gpu_bfyx_3x3_dw_opt)(
    __global UNIT_TYPE* input,
    __global UNIT_TYPE* output,
    __global UNIT_TYPE* weights,
#if BIAS_TERM
    __global UNIT_TYPE* biases,
#endif
    uint split_idx)
{
    const uint local_id = get_local_id(0);
    const uint tile_x = (uint)get_global_id(0);
    const uint tile_y = (uint)get_global_id(1);
    const uint bf = (uint)get_global_id(2);
    const uint f = bf % INPUT0_FEATURE_NUM;
    const uint b = bf / INPUT0_FEATURE_NUM;

    const uint start_x = tile_x / SUB_GROUP_SIZE * TILE_WIDTH;
    const uint offset_x = start_x + (tile_x - tile_x / SUB_GROUP_SIZE * SUB_GROUP_SIZE) % TILE_WIDTH;
    const uint offset = b * INPUT0_BATCH_PITCH + INPUT0_FEATURE_PITCH * f;
    const uint out_offset = b * OUTPUT_BATCH_PITCH + OUTPUT_FEATURE_PITCH * f;

    const int start_y = tile_y * TILE_HEIGHT;
    const int end_y = min(INPUT0_SIZE_Y - 1, start_y + TILE_HEIGHT - 1);
    const uint weight_offset = f * FILTER_IFM_PITCH + local_id;

    // Read 3 lines of SUB_GROUP_SIZE floats.
    // The 3 lines start one float before the current (to the left) and one line up:
    // SUB_GROUP_SIZE=16
    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    // 0 X 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    // In the diagram above X represents the current work item.

    const int input_offset_const = INPUT0_OFFSET + offset + (start_y * INPUT0_Y_PITCH + start_x) - 1;

    const uint base_addr_offset = INPUT0_Y_PITCH;

    UNIT_TYPE input_buffer[3] = { UNIT_VAL_ZERO };
    const int base_offset = -base_addr_offset * UNIT_BYTE_SIZE;

#if FP16_UNIT_USED
    const uint lid = get_sub_group_local_id();
    if(input_offset_const - base_addr_offset >= 0)
        input_buffer[0] = input[input_offset_const - base_addr_offset + lid];
    if(input_offset_const >= 0)
        input_buffer[1] = input[input_offset_const + lid];
#else
    input_buffer[0] = ALIGNED_BLOCK_READ(input, input_offset_const - base_addr_offset);
    input_buffer[1] = ALIGNED_BLOCK_READ(input, input_offset_const);
#endif

    UNIT_TYPE w = weights[weight_offset];

    int first = 0;
    int second = 1;
    int third = 2;
    int input_offset = input_offset_const;

    for (int y = start_y; y <= end_y; y++)
    {
        UNIT_TYPE res = UNIT_VAL_ZERO;
        input_offset += base_addr_offset;

#if FP16_UNIT_USED
        if(input_offset >= 0)
            input_buffer[third] = input[input_offset + lid];
#else
        input_buffer[third] = ALIGNED_BLOCK_READ(input, input_offset);
#endif

        uint kc = 0;
        LOOP(FILTER_SIZE_X, kc,
        {
            res = mad(intel_sub_group_shuffle( w, FILTER_SIZE_Y + kc),intel_sub_group_shuffle( input_buffer[second], local_id + kc),res);

            if (y == 0)
            {
                res = mad(intel_sub_group_shuffle( w, 2*FILTER_SIZE_Y + kc),intel_sub_group_shuffle( input_buffer[third], local_id + kc),res);
            }
            else if (y == INPUT0_SIZE_Y - 1)
            {
                res = mad(intel_sub_group_shuffle( w, kc),intel_sub_group_shuffle( input_buffer[first], local_id + kc),res);
            }
            else
            {
                res = mad(intel_sub_group_shuffle( w, kc),intel_sub_group_shuffle( input_buffer[first], local_id + kc),res);
                res = mad(intel_sub_group_shuffle( w, 2*FILTER_SIZE_Y + kc),intel_sub_group_shuffle( input_buffer[third], local_id + kc),res);
            }
        });

#if BIAS_TERM
        res += biases[f];
#endif

        if ((local_id < TILE_WIDTH) && (offset_x < INPUT0_SIZE_X))
        {
            output[OUTPUT_OFFSET + out_offset + y * INPUT0_SIZE_X + offset_x] = ACTIVATION(res, ACTIVATION_PARAMS);
        }

        first = (first + 1) % 3;
        second = (second + 1) % 3;
        third = (third + 1) % 3;
    }

}

#undef ALIGNED_BLOCK_READ
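The removed depthwise kernel keeps only three input rows live and rotates the first/second/third indices instead of copying rows. The same ring-buffer idea is shown in the plain C++ sketch below for a vertical 3-tap sum; it is scalar, uses hypothetical data, and omits the sub-group shuffles and weights of the real kernel.

    #include <array>
    #include <iostream>
    #include <vector>

    int main() {
        const int H = 6, W = 8;
        std::vector<float> img(H * W, 1.0f);     // hypothetical input image, all ones
        std::array<std::vector<float>, 3> rows;  // ring of 3 rows: above / current / below
        rows[0].assign(W, 0.0f);                 // row above the first row is zero-padded
        rows[1] = std::vector<float>(img.begin(), img.begin() + W);
        int first = 0, second = 1, third = 2;

        for (int y = 0; y < H; ++y) {
            // Load the row below the current one (zero padding at the bottom edge).
            rows[third] = (y + 1 < H)
                ? std::vector<float>(img.begin() + (y + 1) * W, img.begin() + (y + 2) * W)
                : std::vector<float>(W, 0.0f);

            float acc = rows[first][0] + rows[second][0] + rows[third][0];  // 3-tap column sum at x = 0
            if (y == 0) std::cout << "row " << y << ": " << acc << '\n';    // prints 2 at the top edge

            // Rotate the ring so the "below" row becomes "current" for the next iteration.
            first = (first + 1) % 3;
            second = (second + 1) % 3;
            third = (third + 1) % 3;
        }
    }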
@@ -1,254 +0,0 @@
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"
#include "include/data_types.cl"

#define SIMD_SIZE SUB_GROUP_SIZE
// ---------------------------------------------------------------------------------------------------------------------
// Just-in-time macro definitions:
// ---------------------------------------------------------------------------------------------------------------------

// Required JIT constants:
//  - INPUT            - [tensor] Input dimensions (batch, spatial and feature).
//  - OUTPUT           - [tensor] Output dimensions (batch, spatial and feature).
//  - STRIDE           - [tensor] Stride (only spatial). Factors that describe step size in X or Y dimension of
//                       input position of application of convolution filter when next output value
//                       (step 1 in X or Y dimension of output) is computed.
//  - INPUT0_OFFSET    - [tensor] Offset for the first element:
//                       initial offset of the input position of application of convolution filter and output position.
//  - FP16_SUPPORTED   - [0/1] Value indicating whether device supports FP16 OpenCL extension (cl_khr_fp16).
//  - FP16_UNIT_USED   - [0/1] Value indicating that current kernel should use FP16.
//  - UNIT_TYPE        - Type of unit of input/output/weight/bias.
//  - UNIT_VAL_ZERO    - Literal of current UNIT_TYPE that represents 0.
//  - RELU             - [0/1] Indicates that ReLU activation function should be used on output.
//  - NEGATIVE_SLOPE   - [float] Factor for negative output values (required when ReLU is specified).
//
//  - SUB_GROUP_SIZE   - [int] Size of used subgroup (SIMD).
//  - LEFTOVERS        - [int] Optional parameter, required only when the number of ofm is not divisible by SUB_GROUP_SIZE;
//                       see the comment for FEATURES_THREADS_PER_BATCH for more information.

/*
gpu::make_jit_constant("OUTPUT_LIMIT", output_size),
gpu::make_jit_constant("FILTER", filter_mem.argument().size),
gpu::make_jit_constant("FILTER_ARRAY_NUM", split),
gpu::make_jit_constant("OUTPUT_BLOCK_WIDTH", _kernel_data.block_width));
gpu::make_jit_constant("OUTPUT_BLOCK_HEIGHT", _kernel_data.block_height));
gpu::make_jit_constant("IN_BLOCK_ARRAY_SIZE", _kernel_data.input_block_array_size));
gpu::make_jit_constant("IN_BLOCK_WIDTH", _kernel_data.input_block_width));
gpu::make_jit_constant("PREFETCH", _kernel_data.prefetch));
if (_kernel_data.leftovers)
    gpu::make_jit_constant("LEFTOVERS", _kernel_data.leftovers));
*/

// FEATURES_THREADS_PER_BATCH defines how many threads in the z-dimension process a single batch.
// ideally, a z-dimension value of n should indicate processing of the n-th output feature. however, since
// threads are stacked in groups of SUB_GROUP_SIZE, when the number of ofm is not divisible by SUB_GROUP_SIZE
// there are dummy threads added in the z-dimension, LEFTOVERS of them. We need to take them into consideration
// while calculating the batch's id (see lines 86-87). Values calculated by dummy threads are discarded at line 210.
#ifdef LEFTOVERS
#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM + LEFTOVERS)
#else
#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
#endif

__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
__attribute__((reqd_work_group_size(1, 1, 2*SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_os_iyx_osv16_2_sg)(
    const __global UNIT_TYPE* input,
    __global UNIT_TYPE* output,
    const __global UNIT_TYPE* weights,
#if BIAS_TERM
    const __global UNIT_TYPE* bias,
#endif
    uint split_idx) // TODO: removing this parameter causes a performance degradation... :)
{
    const uint oc = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;  // oc = Output Column
    const uint or = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT; // or = Output Row
    const uint fm = (uint)get_group_id(2) * SUB_GROUP_SIZE + get_sub_group_local_id(); //get_global_id(2); // fm = Feature Map = od = Output Depth
    const uint lid = get_sub_group_local_id();

    const uint ifm_part = get_sub_group_id();
    __local float slm_vals[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT * SIMD_SIZE];

    uint batch_idx = fm / FEATURES_THREADS_PER_BATCH;
    uint feature_idx = fm % FEATURES_THREADS_PER_BATCH;
    uint fmg = feature_idx / SUB_GROUP_SIZE;

    UNIT_TYPE in[IN_BLOCK_ARRAY_SIZE];
    UNIT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT];
    UNIT_TYPE w[PREFETCH];
    uint in_addr;
    uint weight_addr = fmg * FILTER_IFM_NUM * FILTER_SIZE_X * FILTER_SIZE_Y * SUB_GROUP_SIZE + lid;
    weight_addr += ifm_part * SUB_GROUP_SIZE * FILTER_IFM_NUM/2 * FILTER_SIZE_X * FILTER_SIZE_Y;

    for(int i = 0; i < (OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT); i++) {
        out[i] = UNIT_VAL_ZERO;
    }

    uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
    in_addr = batch_idx * INPUT0_BATCH_PITCH;
    in_addr += in_split_offset + INPUT0_OFFSET_WITH_PADDING + or * STRIDE_SIZE_Y * INPUT0_Y_PITCH + oc * STRIDE_SIZE_X + lid;
    in_addr += ifm_part * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM/2;

    for(int kd = 0; kd < FILTER_IFM_NUM/2; kd++) // _ID = 3, RGB
    {
        uint tmp_in_addr = in_addr;

#if IN_BLOCK_WIDTH % SUB_GROUP_SIZE == 0
        __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
            // Horizontal position in input block after read.
            const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;

            in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH];

            // If we have row break, move to the next row.
            if (in_block_next_x_pos == IN_BLOCK_WIDTH)
                tmp_in_addr += INPUT0_Y_PITCH;
        }
#elif (2 * IN_BLOCK_WIDTH) % SUB_GROUP_SIZE == 0
        __attribute__((opencl_unroll_hint(IN_BLOCK_ARRAY_SIZE)))
        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE * SUB_GROUP_SIZE; in_block_pos += SUB_GROUP_SIZE) {
            // Horizontal position in input block after read.
            const uint in_block_next_x_pos = in_block_pos % IN_BLOCK_WIDTH + SUB_GROUP_SIZE;

            if (in_block_next_x_pos <= IN_BLOCK_WIDTH) {
                in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH];

                // If we have row break, move to the next row.
                if (in_block_next_x_pos == IN_BLOCK_WIDTH)
                    tmp_in_addr += INPUT0_Y_PITCH;
            }
            else {
                // TODO: Generalize this step to relax IN_BLOCK_WIDTH restrictions.
                // Position in sub-group on which new row need to be read.
                const uint sg_br_pos = IN_BLOCK_WIDTH - in_block_pos % IN_BLOCK_WIDTH;

                if (lid < sg_br_pos)
                    in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr + in_block_pos % IN_BLOCK_WIDTH];
                // We have row break inside sub-group. Need to move to next line.
                tmp_in_addr += INPUT0_Y_PITCH;
                if (lid >= sg_br_pos)
                    in[in_block_pos / SUB_GROUP_SIZE] = input[tmp_in_addr - sg_br_pos];

                // If we have another row break, move to the next row.
                if (in_block_next_x_pos == 2 * IN_BLOCK_WIDTH)
                    tmp_in_addr += INPUT0_Y_PITCH;
            }
        }
#else
#error IN_BLOCK_WIDTH must be multiple of SUB_GROUP_SIZE or half of SUB_GROUP_SIZE. Other scenarios are not currently implemented.
#endif

        //move to next filter
        in_addr += INPUT0_FEATURE_PITCH;

        for(int pf=0; pf<PREFETCH; pf++) {
            w[pf] = weights[weight_addr]; weight_addr += SUB_GROUP_SIZE;
        }

        uint wi = 0;
        uint kr = 0; // kr = Kernel Row
        LOOP(FILTER_SIZE_Y, kr, // LOOP is a macro that unrolls the loop.
        {
            uint kc = 0; // kc = Kernel Column
            LOOP(FILTER_SIZE_X, kc,
            {
                //w = weights[weight_addr];
                for(uint br=0; br<OUTPUT_BLOCK_HEIGHT; br++) {
                    for(uint bc=0; bc<OUTPUT_BLOCK_WIDTH; bc++) {

#if IN_BLOCK_WIDTH != SUB_GROUP_SIZE
                        //if we fix the programming model, then we could use a nice simple 2d array: val = in[br * STRIDE_SIZE_Y + kr][bc * STRIDE_SIZE_X + kc];
                        UNIT_TYPE val = intel_sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
                                                                 (((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) % SUB_GROUP_SIZE);
#else
                        UNIT_TYPE val = intel_sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
#endif

                        out[br * OUTPUT_BLOCK_WIDTH + bc] = mad(w[wi % PREFETCH], val, out[br * OUTPUT_BLOCK_WIDTH + bc]);
                    }
                }
                w[wi % PREFETCH] = weights[weight_addr];
                weight_addr += SUB_GROUP_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
                wi++;
            });
        });
        // addr went beyond due to prefetch so move it back to correct location.
        weight_addr -= PREFETCH * SUB_GROUP_SIZE;
    }

    if(ifm_part == 1)
    {
        for(uint br=0; br<OUTPUT_BLOCK_HEIGHT; br++) {
            for(uint bc=0; bc<OUTPUT_BLOCK_WIDTH; bc++) {
                slm_vals[get_sub_group_local_id() + SIMD_SIZE * (bc + OUTPUT_BLOCK_WIDTH * (br) ) ] = out[br * OUTPUT_BLOCK_WIDTH + bc];
            }
        }
    }

    uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
    uint out_addr = OUTPUT_OFFSET;
    out_addr += batch_idx * OUTPUT_BATCH_PITCH;
    out_addr += out_split_offset + feature_idx * OUTPUT_FEATURE_PITCH; // out_addr points to the start of 16 feature maps.
    out_addr += or * OUTPUT_Y_PITCH + oc; // offset for the 4x3 block that this workitem is working on;

    if(ifm_part == 0)
    {

#if BIAS_TERM
        for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
            for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
#if BIAS_PER_OUTPUT
                const unsigned bias_index = feature_idx*OUTPUT_SIZE_X*OUTPUT_SIZE_Y + or*OUTPUT_SIZE_X + oc;
#else
                const unsigned bias_index = feature_idx;
#endif
                out[r * OUTPUT_BLOCK_WIDTH + c] += bias[bias_index];
            }
        }
#endif
    }

    barrier(CLK_LOCAL_MEM_FENCE); // we want the barrier after bias addition so that the long SLM write latency is shadowed by it

    if(ifm_part == 0)
    {
        for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
            for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
                out[r * OUTPUT_BLOCK_WIDTH + c] += slm_vals[get_sub_group_local_id() + SIMD_SIZE * (c + OUTPUT_BLOCK_WIDTH * r)];
                out[r * OUTPUT_BLOCK_WIDTH + c] = ACTIVATION(out[r * OUTPUT_BLOCK_WIDTH + c], ACTIVATION_PARAMS);
            }
        }

#ifdef LEFTOVERS
        if (feature_idx < OUTPUT_FEATURE_NUM)
#endif
        for(uint r = 0; r < OUTPUT_BLOCK_HEIGHT; r++) {
            if(!(or + r >= OUTPUT_SIZE_Y))
            {
                for(uint c = 0; c < OUTPUT_BLOCK_WIDTH; c++) {
                    // this does a scattered write to 16 different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
                    if(!(oc + c >= OUTPUT_SIZE_X))
                        output[out_addr + r * OUTPUT_Y_PITCH + c] = out[r * OUTPUT_BLOCK_WIDTH + c];
                }
            }
        }

    }

}

#undef FEATURES_THREADS_PER_BATCH
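The FEATURES_THREADS_PER_BATCH comment above boils down to this: the z-range of work items is padded up to a multiple of SUB_GROUP_SIZE per batch, and the padding ("dummy") threads have to be filtered out before any result is written. Below is a minimal host-side C++ sketch of that index math with hypothetical sizes; it is only an illustration, not the removed OpenCL kernel.

    #include <cstddef>
    #include <iostream>

    int main() {
        const std::size_t sub_group_size = 16;
        const std::size_t ofm = 24;  // hypothetical number of output feature maps
        const std::size_t leftovers = (sub_group_size - ofm % sub_group_size) % sub_group_size;  // 8
        const std::size_t features_threads_per_batch = ofm + leftovers;                          // 32
        const std::size_t batches = 2;

        for (std::size_t fm = 0; fm < features_threads_per_batch * batches; ++fm) {
            const std::size_t batch_idx = fm / features_threads_per_batch;
            const std::size_t feature_idx = fm % features_threads_per_batch;
            if (feature_idx >= ofm) continue;  // dummy thread: its result would be discarded
            // A real kernel would write output[batch_idx][feature_idx] here.
            if (feature_idx == 0)
                std::cout << "batch " << batch_idx << " starts at thread " << fm << '\n';
        }
    }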
@@ -1,143 +0,0 @@
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/include_all.cl"
#include "include/sub_group.cl"

__attribute__((reqd_work_group_size(LOCAL_WORK_GROUP_SIZE, 1, 1)))
KERNEL(convolution_gpu_yxfb_yxio_b1_block)(
    const __global float* input,
    __global float* output,
    const __global float* filter,
#if BIAS_TERM
    const __global float* bias,
#endif
    uint split_idx)
{
#ifdef USE_VECTOR_8
    #define VECTOR_FLOAT float8
    #define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN))
    #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
#endif
#ifdef USE_VECTOR_4
    #define VECTOR_FLOAT float4
    #define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN))
    #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
#endif
#ifdef USE_VECTOR_2
    #define VECTOR_FLOAT float2
    #define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN))
    #define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
#endif

    const uint batch_num = INPUT0_BATCH_NUM;
    const uint linear_id_xy = (uint)get_group_id(1) + (uint)get_global_size(1) * (uint)get_group_id(2);
    uint global_id = (((uint)get_group_id(0) * LOCAL_WORK_GROUP_SIZE) / batch_num) * batch_num + (linear_id_xy * FILTER_ARRAY_NUM + split_idx) * (FILTER_OFM_NUM / OFM_PER_WORK_ITEM) * batch_num;

    const uint out_batch_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;
    const uint out_x = get_group_id(1);
    const uint out_y = get_group_id(2);

    const uint out_id = (global_id / batch_num) * OFM_PER_WORK_ITEM * batch_num + out_batch_id;

    const uint ofm_offset = (global_id * (OFM_PER_WORK_ITEM / batch_num)) % FILTER_OFM_NUM;

    const uint sub_group_id = (uint)get_local_id(0) % INPUT0_BATCH_NUM;

    VECTOR_FLOAT _data0 = 0.f;

    const int x = (int)out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int y = (int)out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    for (uint i = 0; i < FILTER_SIZE_Y; i++)
    {
        const int input_offset_y = y + i * DILATION_SIZE_Y;
        const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;

        if(!zero_y)
        {
            for (uint j = 0; j < FILTER_SIZE_X; j++)
            {
                const int input_offset_x = x + j * DILATION_SIZE_X;
                const bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;

                if(!zero)
                {
                    uint input_idx = input_offset_x*INPUT0_X_PITCH + input_offset_y*INPUT0_Y_PITCH;
                    input_idx += INPUT0_OFFSET + split_idx * FILTER_IFM_NUM * INPUT0_FEATURE_PITCH;
                    input_idx += out_batch_id;

                    uint filter_idx = ofm_offset + sub_group_id + i*FILTER_Y_PITCH + j*FILTER_X_PITCH;

#if INPUT0_BATCH_NUM == 1
                    for(uint h = 0; h < FILTER_IFM_NUM / 8; h++)
                    {
                        float _in = as_float(intel_sub_group_block_read((const __global uint*)input + input_idx));
                        float8 _input = TRANSPOSE_BLOCK_8(_in);

                        VECTOR_FLOAT _filter;
                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s0, _filter, _data0);

                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s1, _filter, _data0);

                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s2, _filter, _data0);

                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s3, _filter, _data0);

                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s4, _filter, _data0);

                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s5, _filter, _data0);

                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s6, _filter, _data0);

                        _filter = BLOCK_READ(filter + filter_idx); filter_idx += FILTER_OFM_NUM;
                        _data0 = mad(_input.s7, _filter, _data0);

                        input_idx += 8 * INPUT0_FEATURE_PITCH;
                    }
                    for (uint h = FILTER_IFM_NUM - (FILTER_IFM_NUM % 8); h < FILTER_IFM_NUM; h++)
#else
                    for (uint h = 0; h < FILTER_IFM_NUM; h++)
#endif
                    {
                        VECTOR_FLOAT _filter = BLOCK_READ(filter + filter_idx);
                        _data0 = mad(input[input_idx], _filter, _data0);
                        filter_idx += FILTER_IFM_PITCH;
                        input_idx += INPUT0_FEATURE_PITCH;
                    }
                }
            }
        }
    }

#if BIAS_TERM
    _data0 += BLOCK_READ(bias + ofm_offset);
#endif
    _data0 = ACTIVATION(_data0, ACTIVATION_PARAMS);

    uint _out_id = OUTPUT_OFFSET + out_id;
    BLOCK_WRITE(output + _out_id, _data0);
#if defined(USE_VECTOR_8) || defined(USE_VECTOR_4) || defined(USE_VECTOR_2)
    #undef VECTOR_FLOAT
    #undef BLOCK_READ
    #undef BLOCK_WRITE
#endif
}
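The USE_VECTOR_* macros above only pick how many output feature maps one work item accumulates at once (8, 4 or 2 floats per multiply-add). The same accumulation pattern is shown below in scalar C++ with a hypothetical 8-wide block and made-up sizes; it is an illustration of the idea, not the removed kernel, and omits the OpenCL block reads and sub-group transposes.

    #include <array>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        const std::size_t ifm = 3, ofm_per_work_item = 8;          // hypothetical sizes
        std::vector<float> input(ifm, 0.5f);                       // one input pixel, all channels
        std::vector<float> filter(ifm * ofm_per_work_item, 2.0f);  // weights laid out ofm-contiguous per ifm

        std::array<float, 8> acc{};                                // plays the role of VECTOR_FLOAT _data0
        for (std::size_t h = 0; h < ifm; ++h)
            for (std::size_t o = 0; o < ofm_per_work_item; ++o)
                acc[o] += input[h] * filter[h * ofm_per_work_item + o];  // like mad(input, _filter, _data0)

        std::cout << "acc[0] = " << acc[0] << '\n';                // 3 * 0.5 * 2 = 3
    }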