[IE CLDNN] Cleanup part 2 (#1865)

* [IE CLDNN] Removed some unused kernels and layouts

* [IE CLDNN] Removed bsv4_fsv32 layout

* [IE CLDNN] Removed remaining BF8_XY16 usages. Removed definitions.cl
Vladimir Paramuzov 2020-08-27 23:06:20 +03:00 committed by GitHub
parent 711d208eb8
commit 34ba00174e
87 changed files with 39 additions and 3742 deletions

View File

@@ -344,14 +344,6 @@ struct layout {
tensor get_pitches() const {
auto sizes = get_buffer_size().sizes(format);
if (format == format::byxf_af32) {
sizes[3] = align_to(sizes[3], 32);
}
if (format == format::byx8_f4) {
sizes[3] = align_to(sizes[3], 4);
sizes[2] = align_to(sizes[2], 8);
}
std::vector<tensor::value_type> pitches(sizes.size(), tensor::value_type(1));
std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1, std::multiplies<tensor::value_type>());
return {format, pitches};
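
For reference: with the removed byxf_af32/byx8_f4 special cases gone, get_pitches() reduces to the plain suffix product above. A minimal standalone sketch of that partial_sum line, with assumed bfyx sizes 1x3x224x224:

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    std::vector<int64_t> sizes{1, 3, 224, 224};  // bfyx dimension sizes (assumed example)
    std::vector<int64_t> pitches(sizes.size(), 1);
    // Each axis's pitch is the product of all faster-varying dimension sizes.
    std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1,
                     std::multiplies<int64_t>());
    for (auto p : pitches) std::cout << p << ' ';  // prints: 150528 50176 224 1
}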
@@ -394,15 +386,7 @@ struct layout {
sizes[block_axis] = align_to(sizes[block_axis], block_size);
}
if (this->format == cldnn::format::bf8_xy16 && !(is_aligned_to(sizes[1], 8) && is_aligned_to(sizes[2] * sizes[3], 16))) {
sizes[3] = align_to(sizes[2] * sizes[3], 16);
sizes[2] = 1;
} else if (this->format == cldnn::format::byxf_af32 && !(is_aligned_to(sizes[1], 32))) {
sizes[1] = align_to(sizes[1], 32);
} else if (this->format == cldnn::format::byx8_f4 && (!is_aligned_to(sizes[1], 4) || !is_aligned_to(sizes[2], 8))) {
sizes[1] = align_to(sizes[1], 4);
sizes[2] = align_to(sizes[2], 8);
} else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
sizes[0] = align_to(sizes[0], 8);
sizes[1] = align_to(sizes[1], 32);
} else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) {
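
All branches in this hunk rely on align_to to pad a dimension up to a block multiple; a small sketch with the assumed round-up semantics:

#include <cstddef>
#include <iostream>

// Assumed semantics of cldnn's align_to: round size up to a multiple of align.
constexpr size_t align_to(size_t size, size_t align) {
    return (size + align - 1) / align * align;
}

int main() {
    // os_is_yx_isa8_osv8_isv4 pads OFM to 8 and IFM to 32, as in the kept branch.
    std::cout << align_to(17, 8) << ' ' << align_to(17, 32) << '\n';  // prints: 24 32
}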

View File

@@ -105,7 +105,6 @@ struct format {
bs_fs_zyx_bsv16_fsv16, ///< format used for 3D blocked convolution (batch and features blocked by 16)
bs_fs_yx_bsv16_fsv16, ///< format used for 2D blocked convolution (batch and features blocked by 16)
fs_b_yx_fsv32, ///< format for input for fp16 primitives
fs_bs_yx_bsv4_fsv32, ///< format for batched input for primitives using MMAD
b_fs_yx_fsv4, ///< format for input for IMAD convolutions
bs_xs_xsv8_bsv8, ///< format used only for fully connected weights: bs - batch slice,
///< xs - x slice, bsv8 - 8 values of single slice.
@@ -114,10 +113,6 @@ struct format {
bs_x_bsv16, ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
///< (responses slice), bsv16 - 16 values of single batch slice, x - flattened plane of (fyx)
///< \n \image html bs_x_bsv16.jpg
byxf_af32, ///< format for input for primitives using MMAD
byx8_f4, ///< format for input for MMAD convolutions
bf8_xy16, ///< format used only for convolution 1x1 input, xy aligned to 16, f aligned to 8
///< \n \image html bf8_xy16.jpg
b_fs_yx_32fp, ///< format for data for binary convolutions
winograd_2x3_s1_data, ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
nv12, ///< format for media nv12 input
@@ -227,11 +222,7 @@ struct format {
{ bs_xs_xsv8_bsv8, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 8}}}},
{ bs_xs_xsv8_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 16}}}},
{ bs_x_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{0, 16}}}},
{ bf8_xy16, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 8}}}},
{ winograd_2x3_s1_data, { 1, 1, 2, 0, 0, "bxyf", "bfxy?", {}}},
{ byxf_af32, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
{ byx8_f4 , { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
{ fs_bs_yx_bsv4_fsv32, { 1, 1, 2, 0, 0, "fbyx", "bfxy?", {{0, 4}, {1, 32}}}},
{ b_fs_yx_fsv4, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 4}}}},
{ bfzyx, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {}}},
{ bfwzyx, { 1, 1, 4, 0, 0, "bfwzyx", "bfxyzw", {}}},
@@ -943,23 +934,7 @@ public:
adjusted_coords[external_axis] /= block_size;
}
if (fmt == cldnn::format::byxf_af32 && !(is_aligned_to(my_sizes[3], 32))) {
my_sizes[3] = align_to(my_sizes[3], 32);
} else if (fmt == cldnn::format::byx8_f4 && (!(is_aligned_to(my_sizes[3], 4)) || !(is_aligned_to(my_sizes[2], 8)))) {
my_sizes[3] = align_to(my_sizes[3], 4);
my_sizes[2] = align_to(my_sizes[2], 8);
} else if (fmt == cldnn::format::bf8_xy16) {
// Special case of blocked format, where xy is treated as one flattened dimension
auto flat_xy = adjusted_coords[3] + adjusted_coords[2] * my_sizes[3];
my_sizes.push_back(16);
my_sizes[3] = ceil_div(my_sizes[2] * my_sizes[3], 16);
my_sizes[2] = 1;
adjusted_coords.push_back(flat_xy % 16);
adjusted_coords[3] = flat_xy / 16;
adjusted_coords[2] = 0;
} else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets calculation for formats below
if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets calculation for formats below
!(is_aligned_to(my_sizes[0], 8)) &&
!(is_aligned_to(my_sizes[1], 32))) {
my_sizes[0] = align_to(my_sizes[0], 8);
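
The removed bf8_xy16 branch above fused x and y into one flattened dimension split into 16-wide blocks; a worked sketch of that index transform, with assumed example coordinates:

#include <cstddef>
#include <iostream>

int main() {
    size_t size_x = 7;                 // row length of the original xy plane
    size_t y = 2, x = 5;               // original spatial coordinate
    size_t flat_xy = x + y * size_x;   // 19
    size_t new_x = flat_xy / 16;       // coarse coordinate: 1
    size_t inner = flat_xy % 16;       // position inside the 16-wide block: 3
    std::cout << new_x << ' ' << inner << '\n';
}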

View File

@@ -42,11 +42,7 @@ DataTensor::DataChannelArray DataTensor::dataChannelArray {{
{ DataLayout::bs_fs_yx_bsv16_fsv16, { 0, 1, -1, -1, 2, 3 } },
{ DataLayout::bs_f_bsv8__af8, { -1, -1, -1, -1, 0, 1 } },
{ DataLayout::bs_f_bsv16__af8, { -1, -1, -1, -1, 0, 1 } },
{ DataLayout::bf8_xy16, { 0, 1, -1, -1, 2, 3 } },
{ DataLayout::winograd_2x3_s1_data, { 2, 1, -1, -1, 0, 3 } },
{ DataLayout::byxf_af32, { 1, 2, -1, -1, 0, 3 } },
{ DataLayout::byx8_f4, { 1, 2, -1, -1, 0, 3 } },
{ DataLayout::fs_bs_yx_bsv4_fsv32, { 0, 1, -1, -1, 3, 2 } },
{ DataLayout::b_fs_yx_fsv4, { 0, 1, -1, -1, 2, 3 } },
{ DataLayout::bfzyx, { 0, 1, 2, -1, 3, 4 } },
{ DataLayout::fs_b_yx_fsv32, { 0, 1, -1, -1, 3, 2 } },
@@ -167,26 +163,6 @@ NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
assert(newDims.size() == 5);
newDims[3] = RoundUp(newDims[3], 32);
break;
case bf8_xy16:
assert(newDims.size() == 4);
newDims[1] = RoundUp(newDims[1], 8);
newDims[3] = RoundUp(newDims[2] * newDims[3], 16);
newDims[2] = 1;
break;
case byxf_af32:
assert(newDims.size() == 4);
newDims[0] = RoundUp(newDims[0], 32);
break;
case byx8_f4:
assert(newDims.size() == 4);
newDims[0] = RoundUp(newDims[0], 4);
newDims[1] = RoundUp(newDims[1], 8);
break;
case fs_bs_yx_bsv4_fsv32:
assert(newDims.size() == 4);
newDims[3] = RoundUp(newDims[3], 32);
newDims[2] = RoundUp(newDims[2], 4);
break;
case b_fs_yx_32fp:
assert(newDims.size() == 4);
newDims[3] = RoundUp(newDims[3], 32);
@@ -222,14 +198,6 @@ NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
pitch *= newDims[i];
}
if (l == byxf_af32 || l == fs_bs_yx_bsv4_fsv32 || l == byx8_f4) {
ret[0].pitch = 1;
ret[1].pitch = ret[0].pitch * newDims[0];
ret[2].pitch = ret[1].pitch * newDims[1];
ret[3].pitch = ret[2].pitch * newDims[2];
ret[4].pitch = ret[3].pitch * newDims[3];
}
return ret;
}

View File

@@ -51,11 +51,7 @@ enum DataLayout {
bs_fs_zyx_bsv16_fsv16, // batch, feature, 3D spatial. Blocks of 16 batch and channels
bs_f_bsv8__af8, // for optimized FC
bs_f_bsv16__af8, // for optimized FC
bf8_xy16, // for optimized conv1x1
winograd_2x3_s1_data, // winograd convolution input, F(2,3) -- filter 3x3 with stride 1
byxf_af32, // for MMAD convolution
byx8_f4, // for MMAD convolution
fs_bs_yx_bsv4_fsv32, // for batched MMAD
b_fs_yx_fsv4, // reordering format for swizzled input for convolution using IMAD
bfzyx, // batch+feature+3D spatial
fs_b_yx_fsv32, // for FP16 kernels, 32 features to avoid partial writes

View File

@@ -41,7 +41,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::byxf);
k.EnableInputLayout(DataLayout::fyxb);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::bf);
@@ -51,7 +50,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
k.EnableOutputLayout(DataLayout::byxf);
k.EnableOutputLayout(DataLayout::fyxb);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableTensorOffset();

View File

@@ -25,11 +25,9 @@ ParamsKey ConvolutionKernel_bfyx_1x1::GetSupportedKey() const {
k.EnableOutputDataType(Datatype::F32);
k.EnableInputWeightsType(WeightsType::F16);
k.EnableInputWeightsType(WeightsType::F32);
k.EnableInputLayout(DataLayout::bf8_xy16);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::bf8_xy16);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();

View File

@@ -82,7 +82,6 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
@@ -116,8 +115,6 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p
in_fsv = 4;
else if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16)
in_fsv = 16;
else if (params.inputs[0].GetLayout() == DataLayout::byxf_af32)
in_fsv = 32;
mem_consts.AddConstants({
MakeJitConstant("_ID", RoundUp(input.Feature().v, in_fsv)),

View File

@@ -1,138 +0,0 @@
/*
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_imad_byxf_af32_1x1.h"
static size_t GetTileLength(size_t out_xy, size_t out_f, size_t min_threads) {
for (int tile_len = 14; tile_len > 0; tile_len--) {
// Kernel writes 32 output features per HW thread
size_t threads = (out_xy / tile_len) * out_xy * out_f / 32;
// Choose the largest valid tile with enough HW threads
if ((out_xy % tile_len == 0) && (threads >= min_threads)) {
return tile_len;
}
}
return 1;
}
namespace kernel_selector {
ParamsKey ConvolutionKernel_imad_byxf_af32_1x1::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();
k.EnableBiasPerFeature();
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}
bool ConvolutionKernel_imad_byxf_af32_1x1::Validate(const Params& p, const optional_params& o) const {
if (!Parent::Validate(p, o)) {
return false;
}
const auto& params = static_cast<const convolution_params&>(p);
if (params.filterSize.x != 1 || params.filterSize.y != 1)
return false;
if (params.padding.x != 0 || params.padding.y != 0)
return false;
if (params.output.Feature().v % 32 != 0)
return false;
const auto& input = params.inputs[0];
// we do not support padded input
if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0)
return false;
if (params.split != 1)
return false;
if (params.groups != 1)
return false;
return true;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_1x1::SetDefault(const convolution_params& arg,
int) const {
DispatchData runInfo = Parent::SetDefault(arg);
// Sub-group size
constexpr size_t sub_group_size = 8;
const auto of_maps = arg.output.Feature().v;
const size_t of_maps_per_batch = RoundUp(of_maps, 32);
const size_t of_maps_total = of_maps_per_batch * arg.output.Batch().v;
// Need to have at least 4 HW threads per EU
const size_t tile_length = GetTileLength(arg.output.X().v, of_maps_total, arg.engineInfo.computeUnitsCount * 4);
runInfo.cldnnStyle.blockWidth = tile_length;
runInfo.efficiency = FORCE_PRIORITY_1;
runInfo.gws0 = arg.output.X().v * arg.output.Y().v / tile_length;
runInfo.gws1 = of_maps_total / 4; // TILE_DEPTH==4
runInfo.gws2 = 1;
runInfo.lws0 = 1;
runInfo.lws1 = sub_group_size;
runInfo.lws2 = 1;
return runInfo;
}
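
Worked dispatch sizes for SetDefault above, under assumed shapes (28x28 output, 128 features, batch 1, tile length 14):

#include <cstddef>
#include <iostream>

constexpr size_t RoundUp(size_t v, size_t a) { return (v + a - 1) / a * a; }

int main() {
    size_t out_x = 28, out_y = 28, of_maps = 128, batch = 1, tile_length = 14;
    size_t of_maps_total = RoundUp(of_maps, 32) * batch;  // 128
    std::cout << out_x * out_y / tile_length << ' '       // gws0 = 56
              << of_maps_total / 4 << '\n';               // gws1 = 32 (TILE_DEPTH == 4)
}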
JitConstants ConvolutionKernel_imad_byxf_af32_1x1::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = Parent::GetJitConstants(params, runInfo);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1));
jit.AddConstant(MakeJitConstant("TILE_LENGTH", runInfo.cldnnStyle.blockWidth));
jit.AddConstant(MakeJitConstant("TILE_DEPTH", 4));
jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_scalar = {"", {"b", "f2", "y", "(x+i)"}, "res", input_dt, 1 };
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
}
return jit;
}
KernelsData ConvolutionKernel_imad_byxf_af32_1x1::GetKernelsData(const Params& params,
const optional_params& options) const {
return GetTunedKernelsDataByIndex(params, options);
}
} // namespace kernel_selector

View File

@@ -1,45 +0,0 @@
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_imad_byxf_af32_1x1 : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
ConvolutionKernel_imad_byxf_af32_1x1() : ConvolutionKernelBase("fused_conv_eltwise_gpu_af32_imad_1x1") {}
virtual ~ConvolutionKernel_imad_byxf_af32_1x1() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool Validate(const Params& p, const optional_params& o) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::os_is_osv32_isv32_swizzled_by_4;
}
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ELTWISE,
FusedOpType::QUANTIZE,
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
};
} // namespace kernel_selector

View File

@@ -1,143 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_imad_byxf_af32_depthwise.h"
#define SIMD_SIZE 16
namespace kernel_selector {
ParamsKey ConvolutionKernel_imad_byxf_af32_depthiwise::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();
k.EnableBiasPerFeature();
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableSplitSupport();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableDepthwiseSeparableOpt();
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
k.EnableGroupedConvolution();
return k;
}
static size_t GetTileLength(size_t out_x) {
for (int i = 20; i >= 1; i--) {
if (out_x % i == 0)
return i;
}
return 1;
}
static int GetSplit(size_t out_x, int stride) {
if (out_x >= 75) {
if (stride > 1)
return 1;
else
return 3;
}
if (out_x == 38 && stride == 2)
return 2;
if (out_x < 75) {
if (stride > 1)
return 1;
else if (out_x % 2 == 0)
return 2;
}
return 1;
}
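
A quick driver for GetSplit above (the helper is copied in condensed but behavior-equivalent form), with assumed output widths and strides:

#include <cstddef>
#include <iostream>

static int GetSplit(size_t out_x, int stride) {
    if (out_x >= 75) return stride > 1 ? 1 : 3;
    if (out_x == 38 && stride == 2) return 2;
    if (out_x < 75) {
        if (stride > 1) return 1;
        else if (out_x % 2 == 0) return 2;
    }
    return 1;
}

int main() {
    // 75 wide, stride 1 -> 3; 38 wide, stride 2 -> 2; 57 wide, stride 1 -> 1
    std::cout << GetSplit(75, 1) << GetSplit(38, 2) << GetSplit(57, 1) << '\n';  // 321
}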
bool ConvolutionKernel_imad_byxf_af32_depthiwise::Validate(const Params& p, const optional_params& o) const {
if (!Parent::Validate(p, o)) {
return false;
}
const convolution_params& cp = static_cast<const convolution_params&>(p);
if (cp.inputs[0].Feature().v != cp.groups || cp.output.Feature().v != cp.groups || cp.groups == 1) {
return false;
}
return true;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_depthiwise::SetDefault(const convolution_params& arg,
int) const {
DispatchData runInfo = Parent::SetDefault(arg);
runInfo.efficiency = FORCE_PRIORITY_1;
runInfo.gws0 = Align(arg.output.Feature().v, SIMD_SIZE) * arg.output.Batch().v;
runInfo.gws1 = arg.output.X().v / GetTileLength(arg.output.X().v);
runInfo.gws2 = CeilDiv(arg.output.Y().v, GetSplit(arg.output.Y().v, arg.stride.y));
std::vector<size_t> local = { SIMD_SIZE, 1, 1 };
runInfo.lws0 = local[0];
runInfo.lws1 = local[1];
runInfo.lws2 = local[2];
return runInfo;
}
JitConstants ConvolutionKernel_imad_byxf_af32_depthiwise::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = Parent::GetJitConstants(params, runInfo);
jit.AddConstant(MakeJitConstant("ALIGNED_OFM", Align(params.output.Feature().v, SIMD_SIZE)));
jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", GetTileLength(params.output.X().v)));
jit.AddConstant(MakeJitConstant("SPLIT_Y", GetSplit(params.output.Y().v, params.stride.y)));
jit.AddConstant(MakeJitConstant("SIMD_SIZE", SIMD_SIZE));
if (params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y) != 0)
jit.AddConstant(MakeJitConstant("SPLIT_LEFTOVERS", params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y)));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_scalar = {"", {"b", "of", "(y+m)", "(x+l)"}, "res", input_dt, 1 };
conf_scalar.SetLoopAxes({Tensor::DataChannelName::Y, Tensor::DataChannelName::X});
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
}
return jit;
}
KernelsData ConvolutionKernel_imad_byxf_af32_depthiwise::GetKernelsData(const Params& params,
const optional_params& options) const {
KernelsData kd = GetTunedKernelsDataByIndex(params, options);
if (!kd.empty())
kd[0].estimatedTime = FORCE_PRIORITY_1;
return kd;
}
} // namespace kernel_selector

View File

@@ -1,46 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_imad_byxf_af32_depthiwise : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
ConvolutionKernel_imad_byxf_af32_depthiwise() : ConvolutionKernelBase("convolution_gpu_byxf_af32_depthwise") {}
virtual ~ConvolutionKernel_imad_byxf_af32_depthiwise() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
bool Validate(const Params& p, const optional_params& o) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::goiyx;
}
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ELTWISE,
FusedOpType::QUANTIZE,
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
};
} // namespace kernel_selector

View File

@@ -1,93 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_mmad.h"
namespace kernel_selector {
ParamsKey ConvolutionKernel_mmad::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();
k.EnableBiasPerFeature();
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableSplitSupport();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
k.EnableDifferentTypes();
return k;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad::SetDefault(const convolution_params& arg, int) const {
DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
constexpr size_t sub_group_size = 8;
const auto of_maps = arg.output.Feature().v;
const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
runInfo.efficiency = FORCE_PRIORITY_4;
runInfo.gws0 = arg.output.X().v;
runInfo.gws1 = arg.output.Y().v;
runInfo.gws2 = of_threads_per_batch * arg.output.Batch().v;
runInfo.lws0 = 1;
runInfo.lws1 = 1;
runInfo.lws2 = sub_group_size;
return runInfo;
}
JitConstants ConvolutionKernel_mmad::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = Parent::GetJitConstants(params, runInfo);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
// pitch for special block format used in this kernel
const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
const size_t filter_ofm_block_pitch =
(ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_scalar = {"", {"b", "f", "y", "x"}, "res", input_dt, 1 };
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
}
return jit;
}
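
Worked check of the FILTER_OFM_BLOCK_PITCH formula above for the os_is_yx_isa8_osv8_isv4 weights layout (4 * 8 * 8 = 256 weights per block), with assumed shapes IFM = 64 and a 3x3 filter:

#include <cstddef>
#include <iostream>

constexpr size_t Align(size_t v, size_t a) { return (v + a - 1) / a * a; }

int main() {
    size_t ifm = 64, filter_x = 3, filter_y = 3;          // assumed example shapes
    size_t ifm_32_aligned = Align(ifm, 32);               // 64
    size_t pitch = (ifm_32_aligned / 32) * filter_x * filter_y * 4 * 8 * 8;
    std::cout << pitch << '\n';                           // 2 * 9 * 256 = 4608
}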
KernelsData ConvolutionKernel_mmad::GetKernelsData(const Params& params, const optional_params& options) const {
KernelsData kd = GetTunedKernelsDataByIndex(params, options);
if (!kd.empty())
kd[0].estimatedTime = FORCE_PRIORITY_4;
return kd;
}
} // namespace kernel_selector

View File

@@ -1,45 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_mmad : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
ConvolutionKernel_mmad() : ConvolutionKernelBase("convolution_gpu_mmad") {}
virtual ~ConvolutionKernel_mmad() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ELTWISE,
FusedOpType::QUANTIZE,
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
};
} // namespace kernel_selector

View File

@@ -1,95 +0,0 @@
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_mmad_batched.h"
namespace kernel_selector {
ParamsKey ConvolutionKernel_mmad_batched::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableSplitSupport();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched::SetDefault(const convolution_params& arg,
int) const {
DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
constexpr size_t sub_group_size = 8;
const auto of_maps = arg.output.Feature().v;
const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
runInfo.efficiency = FORCE_PRIORITY_6;
runInfo.gws0 = arg.output.X().v;
runInfo.gws1 = arg.output.Y().v;
runInfo.gws2 = of_threads_per_batch * ((arg.output.Batch().v + 3) / 4);
runInfo.lws0 = 1;
runInfo.lws1 = 1;
runInfo.lws2 = sub_group_size;
return runInfo;
}
JitConstants ConvolutionKernel_mmad_batched::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = Parent::GetJitConstants(params, runInfo);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
// pitch for special block format used in this kernel
const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
const size_t filter_ofm_block_pitch =
(ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
const size_t in_x_pitch = 32 * 4;
const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
const size_t in_offset =
in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
return jit;
}
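
The fs_bs_yx_bsv4_fsv32 pitch chain above steps by 32 features x 4 batches per x element; a worked sketch with assumed padded dims 56x56 and batch 8:

#include <cstddef>
#include <iostream>

int main() {
    size_t x_padded = 56, y_padded = 56, batch = 8;       // assumed example dims
    size_t in_x_pitch = 32 * 4;                           // 32 features x 4 batches
    size_t in_y_pitch = in_x_pitch * x_padded;            // 7168
    size_t in_b_block_pitch = in_y_pitch * y_padded;      // 401408
    size_t in_f_block_pitch = in_b_block_pitch * ((batch + 3) / 4);  // 802816
    std::cout << in_y_pitch << ' ' << in_b_block_pitch << ' '
              << in_f_block_pitch << '\n';
}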
KernelsData ConvolutionKernel_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const {
KernelsData kd = GetTunedKernelsDataByIndex(params, options);
if (!kd.empty())
kd[0].estimatedTime = FORCE_PRIORITY_6;
return kd;
}
} // namespace kernel_selector

View File

@@ -1,39 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_mmad_batched : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
ConvolutionKernel_mmad_batched() : ConvolutionKernelBase("convolution_gpu_mmad_batched") {}
virtual ~ConvolutionKernel_mmad_batched() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}
};
} // namespace kernel_selector

View File

@@ -1,165 +0,0 @@
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_mmad_batched_block.h"
#include "kernel_selector_utils.h"
#include <vector>
namespace kernel_selector {
ParamsKey ConvolutionKernel_mmad_batched_block::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBiasPerFeature();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}
struct block_params {
int32_t out_width;
int32_t out_height;
int32_t out_depth;
};
static block_params get_out_block_size(const convolution_params& p) {
if (p.filterSize.x == 3 && p.filterSize.y == 3) {
if (p.output.X().v == 7)
return {7, 1, 4};
else if (p.output.X().v == 14)
return {7, 1, 4};
else if (p.output.X().v == 28)
return {7, 1, 4};
else if (p.output.X().v == 56)
return {8, 1, 4};
}
return {1, 1, 1};
}
WeightsLayout ConvolutionKernel_mmad_batched_block::GetPreferredWeightsLayout(
const convolution_params &cp) const {
auto block = get_out_block_size(cp);
if (block.out_depth == 4)
return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
else
return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}
bool ConvolutionKernel_mmad_batched_block::Validate(const Params& p, const optional_params& o) const {
if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
return false;
}
const convolution_params& cp = static_cast<const convolution_params&>(p);
// if block sizes are 1x1, then this algorithm is probably not the best
auto block = get_out_block_size(cp);
if (block.out_width == 1 && block.out_height == 1)
return false;
if (cp.output.X().v % block.out_width != 0)
return false;
if (cp.output.Y().v % block.out_height != 0)
return false;
if (cp.filterSize.x == 1)
return false;
return true;
}
size_t static get_wg_batch_count(const convolution_params& params) {
if (params.inputs[0].Batch().v % 64 == 0)
return 16; // because we process 4 batches per SIMD
return 1;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block::SetDefault(const convolution_params& arg,
int) const {
DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
constexpr size_t sub_group_size = 8;
runInfo.efficiency = FORCE_PRIORITY_5;
auto block = get_out_block_size(arg);
runInfo.gws0 = arg.output.X().v / block.out_width;
runInfo.gws1 = arg.output.Y().v / block.out_height;
runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
block.out_depth; // process 4 output channels per Workitem
runInfo.lws0 = 1;
runInfo.lws1 = 1;
runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);
return runInfo;
}
JitConstants ConvolutionKernel_mmad_batched_block::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = Parent::GetJitConstants(params, runInfo);
const int sub_group_size = 8;
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
// pitch for special block format used in this kernel
const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
const size_t filter_ofm_block_pitch =
(ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
const size_t in_x_pitch = 32 * 4;
const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
const size_t in_offset =
in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
const size_t out_x_pitch = 32 * 4;
jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
auto block = get_out_block_size(params);
jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));
jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));
return jit;
}
KernelsData ConvolutionKernel_mmad_batched_block::GetKernelsData(const Params& params,
const optional_params& options) const {
KernelsData kd = GetCommonKernelsData(params, options);
if (!kd.empty())
kd[0].estimatedTime = FORCE_PRIORITY_5;
return kd;
}
} // namespace kernel_selector

View File

@@ -1,38 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_mmad_batched_block : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
ConvolutionKernel_mmad_batched_block() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block") {}
virtual ~ConvolutionKernel_mmad_batched_block() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
bool Validate(const Params& p, const optional_params& o) const override;
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
};
} // namespace kernel_selector

View File

@@ -1,171 +0,0 @@
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_mmad_batched_block_1x1.h"
#include "kernel_selector_utils.h"
#include <vector>
namespace kernel_selector {
ParamsKey ConvolutionKernel_mmad_batched_block_1x1::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBiasPerFeature();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}
struct block_params {
int32_t out_width;
int32_t out_height;
int32_t out_depth;
};
static block_params get_out_block_size(const convolution_params& p) {
if (p.output.X().v == 7)
return {7, 1, 4};
else if (p.output.X().v == 14)
return {7, 1, 4};
else if (p.output.X().v == 28)
return {4, 2, 4};
else if (p.output.X().v == 56)
return {8, 1, 4};
return {1, 1, 1};
}
WeightsLayout ConvolutionKernel_mmad_batched_block_1x1::GetPreferredWeightsLayout(
const convolution_params &cp) const {
auto block = get_out_block_size(cp);
if (block.out_depth == 4)
return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
else
return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}
bool ConvolutionKernel_mmad_batched_block_1x1::Validate(const Params& p, const optional_params& o) const {
if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
return false;
}
const convolution_params& cp = static_cast<const convolution_params&>(p);
// only for conv 1x1
if (cp.filterSize.x != 1 || cp.filterSize.y != 1)
return false;
// only for stride 1x1
if (cp.stride.x != 1 || cp.stride.y != 1)
return false;
// if block sizes are 1x1, then this algorithm is probably not the best
auto block = get_out_block_size(cp);
if (block.out_depth != 4)
return false;
if (cp.output.X().v % block.out_width != 0)
return false;
if (cp.output.Y().v % block.out_height != 0)
return false;
return true;
}
size_t static get_wg_batch_count(const convolution_params& params) {
if (params.inputs[0].Batch().v % 64 == 0)
return 16; // because we process 4 batches per SIMD
return 1;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block_1x1::SetDefault(const convolution_params& arg,
int) const {
DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
constexpr size_t sub_group_size = 8;
runInfo.efficiency = FORCE_PRIORITY_3;
auto block = get_out_block_size(arg);
runInfo.gws0 = arg.output.X().v / block.out_width;
runInfo.gws1 = arg.output.Y().v / block.out_height;
runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
block.out_depth; // process 4 output channels per Workitem
runInfo.lws0 = 1;
runInfo.lws1 = 1;
runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);
return runInfo;
}
JitConstants ConvolutionKernel_mmad_batched_block_1x1::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = Parent::GetJitConstants(params, runInfo);
const int sub_group_size = 8;
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
// pitch for special block format used in this kernel
const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
const size_t filter_ofm_block_pitch =
(ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
const size_t in_x_pitch = 32 * 4;
const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
const size_t in_offset =
in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
const size_t out_x_pitch = 32 * 4;
const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
auto block = get_out_block_size(params);
jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));
jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));
return jit;
}
KernelsData ConvolutionKernel_mmad_batched_block_1x1::GetKernelsData(const Params& params,
const optional_params& options) const {
KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
if (!kd.empty())
kd[0].estimatedTime = FORCE_PRIORITY_3;
return kd;
}
} // namespace kernel_selector

View File

@@ -1,38 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_mmad_batched_block_1x1 : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
ConvolutionKernel_mmad_batched_block_1x1() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block_1x1") {}
virtual ~ConvolutionKernel_mmad_batched_block_1x1() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
bool Validate(const Params& p, const optional_params& o) const override;
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
};
} // namespace kernel_selector

View File

@@ -35,7 +35,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableDilation();

View File

@@ -1,270 +0,0 @@
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "convolution_kernel_mmad_blocks.h"
#include <vector>
#include <utility>
#include <string>
#include <algorithm>
namespace kernel_selector {
ConvolutionKernel_mmad_blocks::ConvolutionKernel_mmad_blocks() : ConvolutionKernelBase("convolution_gpu_mmad_blocks") {
// Generate the dispatch options to the auto-tuner.
std::vector<size_t> blockWidthSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32};
std::vector<size_t> blockHeightSizes = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
std::vector<size_t> prefetchSizes = {1, 2, 3, 4, 5, 6, 8, 10};
std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
const size_t maxBlockSize = 240;
for (auto executionMode : executionModes) {
for (auto blockWidth : blockWidthSizes) {
for (auto blockHeight : blockHeightSizes) {
for (auto prefetch : prefetchSizes) {
if (blockWidth * blockHeight <= maxBlockSize) {
autoTuneOptions.emplace_back(AutoTuneOption{blockWidth, blockHeight, prefetch, executionMode});
}
}
}
}
}
}
ParamsKey ConvolutionKernel_mmad_blocks::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();
k.EnableTensorPitches();
// k.EnableDilation(); TODO: Add dilation support
k.EnableBiasPerFeature();
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableSplitSupport();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
return k;
}
bool ConvolutionKernel_mmad_blocks::Validate(const Params& p, const optional_params& o) const {
if (!Parent::Validate(p, o)) {
return false;
}
return true;
}
static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) {
// how many elements we will compute in each dimension
size_t computed_x = Align(output_x, block_x);
size_t computed_y = Align(output_y, block_y);
// how many simds we need in each dimension
size_t simds_x = computed_x / block_x;
size_t simds_y = computed_y / block_y;
// how many unused values we have in each dimension
size_t unused_x = computed_x - output_x;
size_t unused_y = computed_y - output_y;
block_x -= unused_x / simds_x;
block_y -= unused_y / simds_y;
}
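
A driver for shrink_blocks_to_output_size above, with assumed sizes: a 28x28 output shrinks an initial 16x2 block to 14x2, trimming lanes that would compute unused values.

#include <cstddef>
#include <iostream>

constexpr size_t Align(size_t v, size_t a) { return (v + a - 1) / a * a; }  // assumed round-up

static void shrink_blocks_to_output_size(size_t output_x, size_t output_y,
                                         size_t& block_x, size_t& block_y) {
    size_t computed_x = Align(output_x, block_x);   // elements computed per dimension
    size_t computed_y = Align(output_y, block_y);
    size_t simds_x = computed_x / block_x;          // SIMDs needed per dimension
    size_t simds_y = computed_y / block_y;
    size_t unused_x = computed_x - output_x;        // wasted elements per dimension
    size_t unused_y = computed_y - output_y;
    block_x -= unused_x / simds_x;
    block_y -= unused_y / simds_y;
}

int main() {
    size_t bx = 16, by = 2;
    shrink_blocks_to_output_size(28, 28, bx, by);
    std::cout << bx << 'x' << by << '\n';  // prints: 14x2
}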
ConvolutionKernel_mmad_blocks::AutoTuneOption ConvolutionKernel_mmad_blocks::GetAutoTuneOptions(
const Params& p,
int autoTuneIndex) const {
if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
return autoTuneOptions[autoTuneIndex];
}
// Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
constexpr size_t sub_group_size = 16;
AutoTuneOption option = {0, 0, 0, DEFAULT};
const convolution_params& cp = static_cast<const convolution_params&>(p);
if (cp.stride.x == 1 && cp.stride.y == 1) {
if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
option.blockWidth = 16;
option.blockHeight = 1;
option.prefetch = 4;
// if fewer than 16 values are required to compute one single row of output
// then each WI shall compute one single row to maximize reuse within the SIMD subgroup (this gives very nice
// performance results)
} else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size) {
option.blockWidth = cp.output.X().v;
option.blockHeight = 1;
option.prefetch = 4;
} else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) {
option.blockWidth = sub_group_size - cp.filterSize.x + 1;
option.blockHeight = 2;
option.prefetch = 4;
} else {
option.blockWidth = 4;
option.blockHeight = 3;
option.prefetch = 4;
}
} else if (cp.stride.x == 2 && cp.stride.y == 2) {
option.blockWidth = 5;
option.blockHeight = 4;
option.prefetch = 4;
} else {
option.blockWidth = 4;
option.blockHeight = 3;
option.prefetch = 5;
// run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better
}
// if this is not the 1x1 batch-1 case then shrink the blocks; otherwise we're memory bound and it's best to use 16x1
// block sizes
if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
}
return option;
}
static std::pair<size_t, size_t> get_byxf_af32_req_input_block_dims(size_t output_block_width,
size_t output_block_height,
const uSize& filter_size,
const uSize& stride,
const uSize& dilation,
size_t sub_group_size = 8,
size_t read_chunk_size = 8,
size_t min_read_size = 8) {
assert(output_block_width > 0 && output_block_height > 0);
assert(stride.x > 0 && stride.y > 0);
assert(filter_size.x > 0 && filter_size.y > 0);
// Number of elements in X dimension needed from input to compute output block without re-reading input.
size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1;
// Number of elements in Y dimension needed from input to compute output block without re-reading input.
size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1;
// Required number of elements in X dimension rounded to nearest >= read chunk size.
size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
// Number of sub-group-sized vectors of unit type needed to store input block.
size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sub_group_size);
// size of our array per workitem
input_block_array_size = input_block_req_height * input_block_read_width;
return std::make_pair(input_block_array_size, input_block_read_width);
}
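
Worked check of the input-block sizing above with assumed parameters (14x2 output block, 3x3 filter, stride 1, dilation 1, default chunk/min read of 8): the kernel needs a 16x4 input window, read 16 elements wide.

#include <algorithm>
#include <cstddef>
#include <iostream>

constexpr size_t RoundUp(size_t v, size_t a) { return (v + a - 1) / a * a; }

int main() {
    size_t obw = 14, obh = 2;                               // output block, assumed
    size_t fx = 3, fy = 3, sx = 1, sy = 1, dx = 1, dy = 1;  // filter/stride/dilation
    size_t req_w = (obw - 1) * sx + (fx - 1) * dx + 1;      // 16
    size_t req_h = (obh - 1) * sy + (fy - 1) * dy + 1;      // 4
    size_t read_w = std::max(RoundUp(req_w, 8), size_t{8}); // 16
    std::cout << req_w << 'x' << req_h << ", read width " << read_w
              << ", " << req_h * read_w << " values per WI\n";  // 64
}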
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_blocks::SetDefault(const convolution_params& cp,
int autoTuneIndex) const {
// Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
constexpr size_t sub_group_size = 8;
DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
auto input_block_dims =
get_byxf_af32_req_input_block_dims(runInfo.cldnnStyle.blockWidth,
runInfo.cldnnStyle.blockHeight,
cp.filterSize,
cp.stride,
cp.dilation,
sub_group_size,
runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
sub_group_size);
runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
const auto of_maps = cp.output.Feature().v;
const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
runInfo.efficiency = FORCE_PRIORITY_3;
runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;
runInfo.lws0 = 1;
runInfo.lws1 = 1;
runInfo.lws2 = sub_group_size;
return runInfo;
}
JitConstants ConvolutionKernel_mmad_blocks::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = Parent::GetJitConstants(params, runInfo);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
// pitch for special block format used in this kernel
const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
const size_t filter_ofm_block_pitch =
(ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf_scalar = {"", {"b", "f", "(y+br)", "(x+bc)"}, "res", input_dt, 1 };
jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
}
return jit;
}
KernelsData ConvolutionKernel_mmad_blocks::GetKernelsData(const Params& params, const optional_params& options) const {
KernelsData kd = GetTunedKernelsDataByIndex(params, options);
if (!kd.empty())
kd[0].estimatedTime = FORCE_PRIORITY_2;
return kd;
}
KernelsData ConvolutionKernel_mmad_blocks::GetKernelsDataForAutoTune(const Params& params,
const optional_params& options) const {
if (!Validate(params, options)) {
return {};
}
KernelsData res = {};
for (size_t i = 0; i < autoTuneOptions.size(); i++) {
KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
if (!kd.empty()) {
res.emplace_back(kd[0]);
}
}
return res;
}
} // namespace kernel_selector

View File

@@ -1,59 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <string>
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_mmad_blocks : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
ConvolutionKernel_mmad_blocks();
virtual ~ConvolutionKernel_mmad_blocks() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
bool Validate(const Params& p, const optional_params& o) const override;
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ELTWISE,
FusedOpType::QUANTIZE,
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
private:
struct AutoTuneOption {
size_t blockWidth;
size_t blockHeight;
size_t prefetch;
std::string exeMode;
};
AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
std::vector<AutoTuneOption> autoTuneOptions = {};
};
} // namespace kernel_selector

View File

@ -1,119 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_mmad_slm_2x14_rep4.h"
#include "kernel_selector_utils.h"
namespace kernel_selector {
ParamsKey ConvolutionKernel_mmad_slm_2x14_rep4::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBiasPerFeature();
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}
bool ConvolutionKernel_mmad_slm_2x14_rep4::Validate(const Params& p, const optional_params& o) const {
if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
return false;
}
const convolution_params& cp = static_cast<const convolution_params&>(p);
if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
return false;
if (cp.inputs[0].X().v != 56 || cp.inputs[0].Y().v != 56)
return false;
if (cp.stride.x != 1 || cp.stride.y != 1)
return false;
return true;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_2x14_rep4::SetDefault(const convolution_params& arg,
int) const {
DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
runInfo.efficiency = FORCE_PRIORITY_1;
const size_t rep_count = 4;
const size_t batch_per_wi = 1;
const size_t out_block_width = 14;
const size_t out_block_height = 2;
runInfo.gws0 = arg.output.Feature().v *
(arg.output.Batch().v / (rep_count * batch_per_wi));
runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width; // number of tiles needed to cover output width
runInfo.gws2 = ((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height;
runInfo.lws0 = 32; // depth
runInfo.lws1 = 1; // width
runInfo.lws2 = 4; // height
return runInfo;
}
JitConstants ConvolutionKernel_mmad_slm_2x14_rep4::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));
// pitch for the os_is_yx_isa8_osv8_isv4 weights layout used by this kernel
const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
const size_t filter_ofm_block_pitch =
(ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
const size_t in_x_pitch = 32 * 4;
const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
const size_t in_offset =
in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 14));
jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 2));
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", runInfo.lws2));
return jit;
}
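The pitch chain above follows directly from the fs_bs_yx_bsv4_fsv32 blocking: 32 features and 4 batches are interleaved at every spatial location, so the x pitch is a fixed 128 elements and each outer level multiplies in one padded dimension. A hedged sketch with invented tensor sizes:

// Pitch chain for fs_bs_yx_bsv4_fsv32 (all sizes below are assumptions).
#include <cstddef>

int main() {
    const size_t x_padded = 56, y_padded = 56, batch = 8;    // hypothetical padded dims
    const size_t in_x_pitch = 32 * 4;                        // fsv32 * bsv4 = 128
    const size_t in_y_pitch = in_x_pitch * x_padded;         // one padded row
    const size_t in_b_block_pitch = in_y_pitch * y_padded;   // one block of 4 batches
    const size_t in_f_block_pitch = in_b_block_pitch * ((batch + 3) / 4);
    return in_f_block_pitch == 128 * 56 * 56 * 2 ? 0 : 1;
}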
KernelsData ConvolutionKernel_mmad_slm_2x14_rep4::GetKernelsData(const Params& params,
const optional_params& options) const {
return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
}
} // namespace kernel_selector

View File

@ -1,39 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_mmad_slm_2x14_rep4 : public ConvolutionKernelBase {
public:
ConvolutionKernel_mmad_slm_2x14_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_2x14_rep4") {}
virtual ~ConvolutionKernel_mmad_slm_2x14_rep4() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool Validate(const Params& p, const optional_params& o) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}
};
} // namespace kernel_selector

View File

@ -1,128 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "convolution_kernel_mmad_slm_7x7_rep4.h"
#include "kernel_selector_utils.h"
namespace kernel_selector {
ParamsKey ConvolutionKernel_mmad_slm_7x7_rep4::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableInputWeightsType(WeightsType::INT8);
k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBiasPerFeature();
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}
bool ConvolutionKernel_mmad_slm_7x7_rep4::Validate(const Params& p, const optional_params& o) const {
if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
return false;
}
const convolution_params& cp = static_cast<const convolution_params&>(p);
if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
return false;
if (cp.stride.x != 1 || cp.stride.y != 1)
return false;
if (cp.inputs[0].X().v == 7 && cp.inputs[0].Y().v == 7)
return true;
if (cp.inputs[0].X().v == 14 && cp.inputs[0].Y().v == 14)
return true;
return false;
}
ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_7x7_rep4::SetDefault(const convolution_params& arg,
int) const {
DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
runInfo.efficiency = FORCE_PRIORITY_1;
const size_t rep_count = 4;
const size_t batch_per_wi = 4;
const size_t out_block_width = 7;
// const size_t out_block_height = 1;
runInfo.gws0 = (arg.output.Feature().v * arg.output.Batch().v) /
(rep_count * batch_per_wi);
runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width; // number of tiles needed to cover output width
// since this kernel only applies to 7x7 and 14x14 inputs we manually align gws2 up to 8
runInfo.gws2 = Align(arg.inputs[0].Y().v, 8);
runInfo.lws0 = 16; // depth
runInfo.lws1 = 1; // width
runInfo.lws2 = 8; // height
return runInfo;
}
JitConstants ConvolutionKernel_mmad_slm_7x7_rep4::GetJitConstants(const convolution_params& params,
const DispatchData& runInfo) const {
auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));
// pitch for the os_is_yx_isa8_osv8_isv4 weights layout used by this kernel
const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
const size_t filter_ofm_block_pitch =
(ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
const size_t in_x_pitch = 32 * 4;
const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
const size_t in_offset =
in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();
jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
jit.AddConstant(MakeJitConstant("OUT_X_PITCH", in_x_pitch));
jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 7));
jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 1));
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", 7)); // must be 7 since we process 7 in Y per workgroup
return jit;
}
KernelsData ConvolutionKernel_mmad_slm_7x7_rep4::GetKernelsData(const Params& params,
const optional_params& options) const {
return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
}
} // namespace kernel_selector

View File

@ -1,39 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "convolution_kernel_base.h"
#include <vector>
namespace kernel_selector {
class ConvolutionKernel_mmad_slm_7x7_rep4 : public ConvolutionKernelBase {
public:
ConvolutionKernel_mmad_slm_7x7_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_7x7_rep4") {}
virtual ~ConvolutionKernel_mmad_slm_7x7_rep4() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
protected:
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
bool Validate(const Params& p, const optional_params& o) const override;
WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}
};
} // namespace kernel_selector

View File

@ -33,9 +33,6 @@
#include "convolution_kernel_bfyx_1x1_gemm_buf.h"
#include "convolution_kernel_winograd_2x3_s1_fused.h"
#include "convolution_kernel_winograd_6x3_s1_fused.h"
#include "convolution_kernel_mmad.h"
#include "convolution_kernel_mmad_blocks.h"
#include "convolution_kernel_imad_byxf_af32_depthwise.h"
#include "convolution_kernel_bfyx_depthwise_weights_lwg.h"
#include "convolution_kernel_imad.h"
#include "convolution_kernel_fs_byx_fsv32.h"
@ -51,7 +48,6 @@
#include "deformable_convolution_kernel_bfyx_interp.h"
#include "convolution_kernel_b_fs_zyx_fsv16_fp32.h"
#include "convolution_kernel_b_fs_zyx_fsv16_fp16.h"
#include "convolution_kernel_imad_byxf_af32_1x1.h"
#include "convolution_kernel_imad_b_fs_yx_fsv4_1x1.h"
#include "convolution_kernel_imad_b_fs_yx_fsv4_dw.hpp"
#include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.h"
@ -119,12 +115,6 @@ convolution_kernel_selector::convolution_kernel_selector() {
Attach<ConvolutionKernel_Winograd_2x3_s1_fused>();
Attach<ConvolutionKernel_Winograd_6x3_s1_fused>();
// byxf_af32 int8
Attach<ConvolutionKernel_mmad>();
Attach<ConvolutionKernel_mmad_blocks>();
Attach<ConvolutionKernel_imad_byxf_af32_1x1>();
Attach<ConvolutionKernel_imad_byxf_af32_depthiwise>();
// b_fs_yx_fsv4 kernels
Attach<ConvolutionKernel_imad>();
Attach<ConvolutionKernel_imad_b_fs_yx_fsv4_1x1>();

View File

@ -55,9 +55,6 @@ ParamsKey DeconvolutionKernel_imad_along_f_tile_bfx::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.EnableBatching();
@ -179,9 +176,6 @@ JitConstants DeconvolutionKernel_imad_along_f_tile_bfx::GetJitConstants(const de
input_tile_ifm_pitch = zyx_pitch_factor * 16 * 16;
}
input_in_tile_batch_pitch = 16;
} else if (in_layout == DataLayout::byxf_af32) {
input_tile_ifm_pitch = tile_ifm;
input_in_tile_batch_pitch = zyx_pitch_factor * Align(in.Feature().LogicalDimPadded(), 32);
}
jit.AddConstant(MakeJitConstant("INPUT_VALID_TILE_IFM_PITCH", input_tile_ifm_pitch != 0));
@ -242,8 +236,7 @@ size_t DeconvolutionKernel_imad_along_f_tile_bfx::GetTileIFM(const deconvolution
fsv = 16;
}
if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32
|| params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32
|| params.inputs[0].GetLayout() == DataLayout::byxf_af32) {
|| params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32) {
fsv = 32;
}

View File

@ -41,7 +41,6 @@ ParamsKey DeconvolutionKernel_imad_ref::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableAllOutputLayout();
k.EnableDifferentTypes();

View File

@ -43,14 +43,12 @@ bool EltwiseKernel_vload8::Validate(const Params& params, const optional_params&
const auto& ewParams = static_cast<const eltwise_params&>(params);
for (size_t i = 0; i < ewParams.inputs.size(); i++) {
if (ewParams.inputs[i].GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
(ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
if ((ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
(ewParams.inputs[i].GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
ewParams.inputs[i].GetLayout() == DataLayout::fs_b_yx_fsv32)
return false;
}
if (ewParams.output.GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
(ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
(ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32)
return false;

View File

@ -37,7 +37,6 @@ ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const {
k.EnableDifferentTypes();
k.EnableInputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
k.EnableOutputLayout(DataLayout::bf);
@ -129,7 +128,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
size_t input_y_pitch = input.Y().pitch;
size_t input_z_pitch = input.Z().pitch;
if (input.GetLayout() == DataLayout::byxf_af32 || input.GetLayout() == DataLayout::bfyx) {
if (input.GetLayout() == DataLayout::bfyx) {
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32));
} else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) {
input_x_pitch = 32;

View File

@ -136,7 +136,7 @@ PoolingKernelBase::DispatchData PoolingKernelBase::SetDefault(const pooling_para
kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
if (output.GetLayout() == DataLayout::bfyx || output.GetLayout() == DataLayout::b_fs_yx_fsv4 ||
output.GetLayout() == DataLayout::byxf || output.GetLayout() == DataLayout::byxf_af32 ||
output.GetLayout() == DataLayout::byxf ||
output.GetLayout() == DataLayout::bfzyx || output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
// Determine global work sizes.

View File

@ -27,7 +27,6 @@ ParamsKey PoolingKerneGPU_b_fs_yx_fsv4::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();

View File

@ -1,89 +0,0 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pooling_kernel_gpu_byxf_af32.h"
namespace kernel_selector {
ParamsKey PoolingKerneGPU_byxf_af32::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
k.EnablePoolType(PoolType::MAX);
k.EnablePoolType(PoolType::AVG);
k.EnablePoolRemainder(PoolRemainder::FLOOR);
k.EnablePoolRemainder(PoolRemainder::CEIL);
k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
k.EnableDifferentTypes();
return k;
}
PoolingKernelBase::DispatchData PoolingKerneGPU_byxf_af32::SetDefault(const pooling_params& params) const {
constexpr int simdSize = 8;
DispatchData runInfo = PoolingKernelBase::SetDefault(params);
runInfo.gws0 = params.output.X().v;
runInfo.gws1 = params.output.Y().v;
// byxf_af32 layout: each work item processes 4 features, so a SIMD-8 sub-group
// covers a full 32-feature slice; divide the 32-aligned feature count by 4
runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * params.output.Batch().v) / 4;
runInfo.lws0 = 1;
runInfo.lws1 = 1;
runInfo.lws2 = simdSize;
return runInfo;
}
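Spelled out with concrete numbers (the feature and batch counts below are invented), the dispatch arithmetic looks like this:

// Dispatch-size check for the byxf_af32 pooling kernel (hypothetical sizes).
#include <cstddef>

static size_t RoundUpTo(size_t value, size_t multiple) {
    return (value + multiple - 1) / multiple * multiple;
}

int main() {
    const size_t features = 40, batch = 2;                     // assumed
    const size_t gws2 = (RoundUpTo(features, 32) * batch) / 4; // (64 * 2) / 4 = 32
    // 32 work items at SIMD 8 -> 4 sub-groups, each covering one 32-feature slice
    return gws2 == 32 ? 0 : 1;
}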
JitConstants PoolingKerneGPU_byxf_af32::GetJitConstants(const pooling_params& params, DispatchData kd) const {
JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd);
jit.AddConstant(MakeJitConstant("AS_INPUT_TYPE(val)", "as_" + toCLType(params.inputs[0].GetDType()) + "4(val)"));
jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf = {"",
{"b", "f", "y", "x"},
"fused_pool_result",
input_dt,
4,
LoadType::LT_UNALIGNED,
BoundaryCheck::ENABLED,
IndexType::TENSOR_COORD,
Tensor::DataChannelName::FEATURE};
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
}
return jit;
}
KernelsData PoolingKerneGPU_byxf_af32::GetKernelsData(const Params& params, const optional_params& options) const {
return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
}
} // namespace kernel_selector

View File

@ -1,37 +0,0 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pooling_kernel_base.h"
#include <vector>
namespace kernel_selector {
class PoolingKerneGPU_byxf_af32 : public PoolingKernelBase {
public:
PoolingKerneGPU_byxf_af32() : PoolingKernelBase("pooling_gpu_byxf_af32") {}
virtual ~PoolingKerneGPU_byxf_af32() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
DispatchData SetDefault(const pooling_params& params) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::ELTWISE,
FusedOpType::QUANTIZE,
FusedOpType::SCALE,
FusedOpType::ACTIVATION };
}
};
} // namespace kernel_selector

View File

@ -29,7 +29,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::bfzyx);
k.EnableInputLayout(DataLayout::yxfb);
k.EnableInputLayout(DataLayout::byxf);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
@ -38,7 +37,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const {
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::byxf);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);

View File

@ -18,7 +18,6 @@
#include "pooling_kernel_gpu_byxf_opt.h"
#include "pooling_kernel_gpu_bfyx_block_opt.h"
#include "pooling_kernel_gpu_byxf_padding_opt.h"
#include "pooling_kernel_gpu_byxf_af32.h"
#include "pooling_kernel_gpu_int8_ref.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv4.h"
#include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
@ -35,7 +34,6 @@ pooling_kernel_selector::pooling_kernel_selector() {
Attach<PoolingKernelGPUBfyxBlockOpt>();
Attach<PoolingKernelGPUByxfPaddingOpt>();
Attach<PoolingKernelGPUInt8Ref>();
Attach<PoolingKerneGPU_byxf_af32>();
Attach<PoolingKerneGPU_b_fs_yx_fsv4>();
Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
Attach<PoolingKernel_b_fs_yx_fsv16>();

View File

@ -40,14 +40,12 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
k.EnableInputLayout(DataLayout::byxf_af32);
k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::yxfb);
k.EnableOutputLayout(DataLayout::bfzyx);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
k.EnableOutputLayout(DataLayout::byxf_af32);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);

View File

@ -1,87 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
#include "kernel_selector_utils.h"
namespace kernel_selector {
ParamsKey reorder_kernel_byxf_f32_to_byx8_f4_i8::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT8);
k.EnableDifferentTypes();
k.EnableInputLayout(DataLayout::byxf);
k.EnableOutputLayout(DataLayout::byx8_f4);
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
return k;
}
bool reorder_kernel_byxf_f32_to_byx8_f4_i8::Validate(const Params& p, const optional_params& o) const {
if (!ReorderKernelBase::Validate(p, o)) {
return false;
}
const reorder_params& params = static_cast<const reorder_params&>(p);
if (params.output.X().v % 16 != 0)
return false;
if (params.inputs[0].Feature().v != 3)
return false;
if (params.mode == MeanSubtractMode::IN_BUFFER && params.mean.LogicalSize() != params.inputs[0].Feature().v)
return false;
return true;
}
size_t static get_wg_batch_size(const reorder_params& params) {
if (params.inputs[0].Batch().v % 16 == 0)
return 16;
return 1;
}
reorder_kernel_byxf_f32_to_byx8_f4_i8::DispatchData reorder_kernel_byxf_f32_to_byx8_f4_i8::SetDefault(
const reorder_params& params) const {
DispatchData kd;
const auto& input = params.inputs[0];
kd.gws0 = input.X().v;
kd.gws1 = input.Y().v;
kd.gws2 = input.Batch().v;
kd.lws0 = 16;
kd.lws1 = 1;
kd.lws2 = get_wg_batch_size(params);
return kd;
}
JitConstants reorder_kernel_byxf_f32_to_byx8_f4_i8::GetJitConstants(const reorder_params& params) const {
auto jit = ReorderKernelBase::GetJitConstants(params);
jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0]));
jit.AddConstant(MakeJitConstant("WG_BATCH_SIZE", get_wg_batch_size(params)));
return jit;
}
KernelsData reorder_kernel_byxf_f32_to_byx8_f4_i8::GetKernelsData(const Params& params,
const optional_params& options) const {
const reorder_params& orgParams = static_cast<const reorder_params&>(params);
return GetCommonKernelsData(orgParams, options, FORCE_PRIORITY_5);
}
} // namespace kernel_selector

View File

@ -1,32 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "reorder_kernel_base.h"
namespace kernel_selector {
class reorder_kernel_byxf_f32_to_byx8_f4_i8 : public ReorderKernelBase {
public:
reorder_kernel_byxf_f32_to_byx8_f4_i8() : ReorderKernelBase("reorder_data_byxf_f32_to_byx8_f4_i8") {}
virtual ~reorder_kernel_byxf_f32_to_byx8_f4_i8() {}
bool Validate(const Params& p, const optional_params& o) const override;
DispatchData SetDefault(const reorder_params& params) const override;
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
JitConstants GetJitConstants(const reorder_params& params) const override;
};
} // namespace kernel_selector

View File

@ -33,7 +33,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::bfwzyx);
k.EnableInputLayout(DataLayout::bs_f_bsv8__af8);
k.EnableInputLayout(DataLayout::bs_f_bsv16__af8);
k.EnableInputLayout(DataLayout::bf8_xy16);
k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
@ -45,7 +44,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const {
k.EnableOutputLayout(DataLayout::bfwzyx);
k.EnableOutputLayout(DataLayout::bs_f_bsv8__af8);
k.EnableOutputLayout(DataLayout::bs_f_bsv16__af8);
k.EnableOutputLayout(DataLayout::bf8_xy16);
k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);

View File

@ -19,7 +19,6 @@
#include "reorder_from_winograd_2x3_kernel.h"
#include "reorder_to_winograd_2x3_kernel.h"
#include "reorder_kernel_to_yxfb_batched.h"
#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
#include "reorder_kernel_binary.h"
#include "reorder_biplanar_nv12.h"
#include "reorder_kernel_fs_b_yx_fsv32_to_bfyx.h"
@ -33,7 +32,6 @@ reorder_kernel_selector::reorder_kernel_selector() {
Attach<ReorderFromWinograd2x3Kernel>();
Attach<ReorderToWinograd2x3Kernel>();
Attach<ReorderKernel_to_yxfb_batched>();
Attach<reorder_kernel_byxf_f32_to_byx8_f4_i8>();
Attach<reorder_biplanar_nv12>();
Attach<ReorderKernel_fs_b_yx_fsv32_to_bfyx>();
}

View File

@ -61,8 +61,6 @@ static size_t packing_factor(const resample_params& params) {
return 16;
case DataLayout::b_fs_yx_fsv4:
return 4;
case DataLayout::byxf_af32:
return 16;
default:
break;
}

View File

@ -17,7 +17,7 @@
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
{ \
const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
@ -64,9 +64,9 @@
__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_bfyx_1x1)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
#if BIAS_TERM
__global BIAS_TYPE* biases,
#endif
@ -107,10 +107,10 @@ KERNEL(convolution_bfyx_1x1)(
{
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00;
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
uint input_idx = input_offset + k * 8 * xy_block_num * 16;
uint filter_idx = filter_offset + k * 8 * 16;
blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx);
@ -124,11 +124,7 @@ KERNEL(convolution_bfyx_1x1)(
for(uint i = 0; i < 16; i++)
{
#if OUTPUT_LAYOUT_BF8_XY16
const uint dst_index = GET_DATA_BF8_XY16_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
#else
const uint dst_index = GET_DATA_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
#endif
#if LEFTOVERS
if(group_f+i < OUTPUT_FEATURE_NUM)
#endif

View File

@ -1,214 +0,0 @@
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/mmad.cl"
#if STRIDE_SIZE_Y == DILATION_SIZE_Y
#define BLOCK_Y_SIZE (FILTER_SIZE_Y + (SPLIT_Y - 1))
#define LOAD_Y_WITH_STRIDES
#else
#define BLOCK_Y_SIZE ((SPLIT_Y - 1) * STRIDE_SIZE_Y + (FILTER_SIZE_Y - 1) * (DILATION_SIZE_Y - 1) + FILTER_SIZE_Y)
#endif
#if STRIDE_SIZE_X == DILATION_SIZE_X
#define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
#define LOAD_X_WITH_STRIDES
#else
#define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
#define LOAD_X_WITH_STRIDES
#define DONT_USE_X_SHIFTS
#endif
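To make the two BLOCK_Y_SIZE branches concrete: with a 3-tap filter and SPLIT_Y = 4, the strided path loads 6 input rows, while, say, stride 2 with dilation 1 needs 9. A small check under those assumed values:

// Worked example of the BLOCK_Y_SIZE macros (filter/stride values are illustrative).
#include <cstddef>

int main() {
    const size_t filter_y = 3, split_y = 4;
    const size_t stride_y = 2, dilation_y = 1;        // generic branch inputs
    const size_t strided = filter_y + (split_y - 1);  // stride == dilation: 6 rows
    const size_t generic = (split_y - 1) * stride_y
                         + (filter_y - 1) * (dilation_y - 1) + filter_y; // 9 rows
    return (strided == 6 && generic == 9) ? 0 : 1;
}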
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
KERNEL(convolution_gpu_byxf_af32_depthwise)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
#if BIAS_TERM
__global BIAS_TYPE* biases,
#endif
#if HAS_FUSED_OPS_DECLS
FUSED_OPS_DECLS,
#endif
uint split_idx)
{
const uint x = get_global_id(1) * OUT_BLOCK_WIDTH;
const uint y = get_global_id(2) * SPLIT_Y;
#if OUTPUT_BATCH_NUM == 1
const uint of = get_global_id(0);
const uint b = 0;
#else
const uint of = (uint)get_global_id(0) % ALIGNED_OFM;
const uint b = (uint)get_global_id(0) / ALIGNED_OFM;
#endif
const uint g = of;
if (of >= OUTPUT_FEATURE_NUM)
return;
int dotProd[SPLIT_Y] = {0};
OUTPUT_TYPE out[SPLIT_Y];
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
const uint filter_offset = g*FILTER_GROUPS_PITCH;
const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + g*FILTER_IFM_NUM;
// read all weights
FILTER_TYPE w[FILTER_IFM_PITCH];
__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
for (int j = 0; j < FILTER_SIZE_Y; j++) {
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
for (int i = 0; i < FILTER_SIZE_X; i++) {
w[j * FILTER_SIZE_X + i] = weights[filter_offset + j * FILTER_Y_PITCH + i * FILTER_X_PITCH];
}
}
// initial input read
INPUT0_TYPE in[FILTER_SIZE_X_PRELOAD * BLOCK_Y_SIZE];
__attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
for (int i = 0; i < BLOCK_Y_SIZE; i++) {
__attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
#ifdef LOAD_Y_WITH_STRIDES
int input_offset_y = input_y + i * DILATION_SIZE_Y;
#else
int input_offset_y = input_y + i;
#endif
#ifdef LOAD_X_WITH_STRIDES
int input_offset_x = input_x + j * DILATION_SIZE_X;
#else
int input_offset_x = input_x + j;
#endif
uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
}
}
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_PRELOAD;
#endif
for (int l = 0; l < OUT_BLOCK_WIDTH; l++) {
// calculate the dot product
__attribute__((opencl_unroll_hint(SPLIT_Y)))
for (int i = 0; i < SPLIT_Y; i++) {
__attribute__((opencl_unroll_hint(FILTER_IFM_PITCH)))
for (int j = 0; j < FILTER_IFM_PITCH; j++) {
#if defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
dotProd[i] += (int)in[start_pos_y + j] * (int)w[j];
#elif defined(LOAD_X_WITH_STRIDES) && !defined(LOAD_Y_WITH_STRIDES)
const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
const uint pos_x = (j % FILTER_SIZE_X);
dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
#elif defined(LOAD_Y_WITH_STRIDES) && !defined(LOAD_X_WITH_STRIDES)
const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * FILTER_SIZE_X_PRELOAD;
const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
#else
const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
#endif // defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
}
}
__attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
for (int i = 0; i < BLOCK_Y_SIZE; i++) {
// inputs shift
#ifndef DONT_USE_X_SHIFTS
#if (FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X) > 0
__attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X)))
#endif
for (int j = 0; j < FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X; j++) {
in[i * FILTER_SIZE_X_PRELOAD + j] = in[i * FILTER_SIZE_X_PRELOAD + j + STRIDE_SIZE_X];
}
#endif
// read additional inputs
#ifdef LOAD_Y_WITH_STRIDES
int input_offset_y = input_y + i * DILATION_SIZE_Y;
#else
int input_offset_y = input_y + i;
#endif // LOAD_Y_WITH_STRIDES
#if defined(DONT_USE_X_SHIFTS)
__attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + j * DILATION_SIZE_X;
uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
}
#else
{
int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) * DILATION_SIZE_X;
uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
in[i * FILTER_SIZE_X_PRELOAD + FILTER_SIZE_X_PRELOAD - 1] = input[input_idx];
}
#endif // defined(DONT_USE_X_SHIFTS)
}
__attribute__((opencl_unroll_hint(SPLIT_Y)))
for (int m = 0; m < SPLIT_Y; m++) {
#if BIAS_TERM
#if BIAS_PER_OUTPUT
#if OUTPUT_LAYOUT_BYXF_AF32 == 1
const uint bias_index = GET_DATA_INDEX(BIAS, b, of, y + m, x + l);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
const uint bias_index = GET_DATA_B_FS_YX_FSV4_INDEX(BIAS, b, of, y + m, x + l);
#else
#error "Incorrect output layout"
#endif
#elif BIAS_PER_OFM
const uint bias_index = of;
#endif
// TODO: Maybe half should be supported as well.
float res = (float)dotProd[m] + biases[bias_index];
#else
float res = (float)dotProd[m];
#endif
dotProd[m] = 0;
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC;
#else
FUSED_OPS;
#endif
out[m] = FUSED_OPS_RESULT;
#else
out[m] = TO_OUTPUT_TYPE(res);
#endif
}
__attribute__((opencl_unroll_hint(SPLIT_Y)))
for (int m = 0; m < SPLIT_Y; m++) {
#ifdef SPLIT_LEFTOVERS
if (y + m >= OUTPUT_SIZE_Y)
continue;
#endif
const uint dst_index = OUTPUT_GET_INDEX(b, of, y + m, x + l);
output[dst_index] = ACTIVATION(out[m], ACTIVATION_PARAMS);
}
} // OUT_BLOCK_WIDTH
}

View File

@ -1,124 +0,0 @@
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/mmad.cl"
#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(convolution_MMAD)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
#if BIAS_TERM
__global BIAS_TYPE* biases,
#endif
#if HAS_FUSED_OPS_DECLS
FUSED_OPS_DECLS,
#endif
uint split_idx)
{
const uint x = get_global_id(0);
const uint y = get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
const uint f = get_global_id(2);
const uint b = 0;
#else
const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
#endif
int dotProd = 0;
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
{
for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
{
const int input_offset_y = input_y + j * DILATION_SIZE_Y;
const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
if(!zero_y)
{
for (uint i = 0; i < FILTER_SIZE_X ; ++i)
{
const int input_offset_x = input_x + i * DILATION_SIZE_X;
const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
if(!zero_x)
{
uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32;
uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
PACKED_TYPE input_data = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations; //activations of all lanes
activations.s0 = sub_group_broadcast(input_data, 0);
activations.s1 = sub_group_broadcast(input_data, 1);
activations.s2 = sub_group_broadcast(input_data, 2);
activations.s3 = sub_group_broadcast(input_data, 3);
activations.s4 = sub_group_broadcast(input_data, 4);
activations.s5 = sub_group_broadcast(input_data, 5);
activations.s6 = sub_group_broadcast(input_data, 6);
activations.s7 = sub_group_broadcast(input_data, 7);
int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));
dotProd = MMAD_8(activations, weights_data, dotProd);
}
}
}
}
}
#if BIAS_TERM
#if BIAS_PER_OUTPUT
const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
const uint bias_index = f;
#endif
float res = (float)dotProd + biases[bias_index];
#else
float res = (float)dotProd;
#endif // BIAS_TERM
#if HAS_FUSED_OPS
FUSED_OPS;
OUTPUT_TYPE result = FUSED_OPS_RESULT;
#else
OUTPUT_TYPE result = TO_OUTPUT_TYPE(res);
#endif
const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
const uint dst_index = OUTPUT_GET_INDEX(b, f, y, x) + out_split_offset;
output[dst_index] = result;
}
#undef FILTER_IFM_MMAD_NUM
#undef FILTER_OFM_MMAD_NUM
#undef FILTER_IFM_ALIGNED
#undef FILTER_OFM_ALIGNED
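The broadcast loop above hands the packed 32-bit activation word of each of the eight sub-group lanes to every lane, and MMAD_8 then accumulates eight 4-way int8 dot products. A scalar C++ stand-in for one such 4-way step (a software model, not the real intrinsic; signedness is assumed):

// Scalar model of one 4-way int8 multiply-accumulate (IMAD/MMAD lane step).
#include <cstdint>

static int32_t imad4(int32_t acc, uint32_t activations, uint32_t weights) {
    for (int i = 0; i < 4; ++i) {
        const int8_t a = static_cast<int8_t>((activations >> (8 * i)) & 0xFF);
        const int8_t w = static_cast<int8_t>((weights >> (8 * i)) & 0xFF);
        acc += static_cast<int32_t>(a) * static_cast<int32_t>(w);
    }
    return acc;
}

int main() {
    // packed bytes {1, 2, 3, 4} against {1, 1, 1, 1}: dot product = 10
    return imad4(0, 0x04030201u, 0x01010101u) == 10 ? 0 : 1;
}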

View File

@ -1,158 +0,0 @@
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/mmad.cl"
#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(convolution_MMAD_blocks)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
#if BIAS_TERM
__global BIAS_TYPE* biases,
#endif
#if HAS_FUSED_OPS_DECLS
FUSED_OPS_DECLS,
#endif
uint split_idx)
{
const uint x = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;
const uint y = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT;
#if OUTPUT_BATCH_NUM == 1
const uint f = (uint)get_global_id(2);
const uint b = 0;
#else
const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
#endif
int acc[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
PACKED_TYPE in[IN_BLOCK_ARRAY_SIZE];
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
uint in_addr = input_offset + input_x * INPUT0_X_PITCH + input_y * INPUT0_Y_PITCH;
uint filter_idx = filter_offset;
__attribute__((opencl_unroll_hint(1)))
for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
{
// preload input data
for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE; in_block_pos++)
{
uint block_x = in_block_pos % IN_BLOCK_WIDTH;
uint block_y = in_block_pos / IN_BLOCK_WIDTH;
uint input_idx = in_addr + block_x * INPUT0_X_PITCH + block_y * INPUT0_Y_PITCH;
in[in_block_pos] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
}
// end of preloading input data
__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
{
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
for (uint i = 0; i < FILTER_SIZE_X ; ++i)
{
int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));
__attribute__((opencl_unroll_hint(OUTPUT_BLOCK_HEIGHT)))
for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
{
__attribute__((opencl_unroll_hint(OUTPUT_BLOCK_WIDTH)))
for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
{
PACKED_TYPE input_data = in[(br * STRIDE_SIZE_Y + j) * IN_BLOCK_WIDTH + bc * STRIDE_SIZE_X + i];
MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations; //activations of all lanes
activations.s0 = sub_group_broadcast(input_data, 0);
activations.s1 = sub_group_broadcast(input_data, 1);
activations.s2 = sub_group_broadcast(input_data, 2);
activations.s3 = sub_group_broadcast(input_data, 3);
activations.s4 = sub_group_broadcast(input_data, 4);
activations.s5 = sub_group_broadcast(input_data, 5);
activations.s6 = sub_group_broadcast(input_data, 6);
activations.s7 = sub_group_broadcast(input_data, 7);
acc[br * OUTPUT_BLOCK_WIDTH + bc] = MMAD_8(activations, weights_data, acc[br * OUTPUT_BLOCK_WIDTH + bc]);
}
}
filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel
}
}
in_addr += 32; // 4 features per channel * 8 SIMD channels
}
#if BIAS_TERM
#if BIAS_PER_OUTPUT
const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
const uint bias_index = f;
#endif
#endif // BIAS_TERM
OUTPUT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
{
for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
{
#if BIAS_TERM
// TODO: Maybe half should be supported as well.
float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc] + biases[bias_index];
#else
float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc];
#endif
#if HAS_FUSED_OPS
FUSED_OPS;
out[br * OUTPUT_BLOCK_WIDTH + bc] = FUSED_OPS_RESULT;
#else
out[br * OUTPUT_BLOCK_WIDTH + bc] = TO_OUTPUT_TYPE(res);
#endif
}
}
const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
{
if(y + br < OUTPUT_SIZE_Y)
{
for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
{
if(x + bc < OUTPUT_SIZE_X)
{
const uint dst_index = OUTPUT_GET_INDEX(b, f, y+br, x+bc) + out_split_offset;
output[dst_index] = out[br * OUTPUT_BLOCK_WIDTH + bc];
}
}
}
}
}
#undef FILTER_IFM_MMAD_NUM
#undef FILTER_OFM_MMAD_NUM
#undef FILTER_IFM_ALIGNED
#undef FILTER_OFM_ALIGNED

View File

@ -1,163 +0,0 @@
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/common.cl"
#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#if defined(ACCUMULATOR_TYPE)
#undef ACCUMULATOR_TYPE
#endif
#if QUANTIZATION_TERM
# define ACCUMULATOR_TYPE int
# define ACTIVATION_TYPE float
# define TO_ACTIVATION_TYPE(x) convert_float(x)
#else
# define ACCUMULATOR_TYPE INPUT0_TYPE
# define ACTIVATION_TYPE INPUT0_TYPE
# define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
#endif
#define FILTER_IFM_SLICES_NUM ((FILTER_IFM_NUM + 31) / 32)
#define FILTER_OFM_NUM_ALIGNED ((FILTER_OFM_NUM + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE * SUB_GROUP_SIZE)
// we pack four 8-bit activations into each 32-bit word
#define PACK 4
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(fused_conv_eltwise_gpu_af32_imad_1x1)(
const __global PACKED_TYPE* input,
__global OUTPUT_TYPE* restrict output,
const __global uint* weights,
#if BIAS_TERM
__global BIAS_TYPE* biases,
#endif
#if HAS_FUSED_OPS_DECLS
FUSED_OPS_DECLS,
#endif
uint split_idx)
{
const uint x = (uint)get_global_id(0) * TILE_LENGTH % OUTPUT_SIZE_X;
const uint y = (uint)get_global_id(0) * TILE_LENGTH / OUTPUT_SIZE_X;
const uint f = (((uint)get_global_id(1) * TILE_DEPTH) % FILTER_OFM_NUM_ALIGNED) / (TILE_DEPTH * SUB_GROUP_SIZE) * (TILE_DEPTH * SUB_GROUP_SIZE);
const uint b = ((uint)get_global_id(1) * TILE_DEPTH) / FILTER_OFM_NUM_ALIGNED;
const uint lid = get_sub_group_local_id();
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
PACKED_TYPE input_slice[TILE_LENGTH];
int8 weights_slice;
ACCUMULATOR_TYPE accu[TILE_LENGTH][TILE_DEPTH] = {0};
uint filter_idx = f * FILTER_IFM_SLICES_NUM * 32 / PACK;
uint in_addr = (INPUT0_GET_INDEX(b, 0, input_y, input_x)) / PACK;
__attribute__((opencl_unroll_hint(1)))
for (uint k = 0; k < FILTER_IFM_SLICES_NUM; ++k)
{
// Read 32 input features for each pixel in the tile. 4 features in each int, 8 ints across SIMD
__attribute__((opencl_unroll_hint(TILE_LENGTH)))
for (uint i = 0; i < TILE_LENGTH; ++i)
{
uint tmp_addr = in_addr + i * INPUT0_X_PITCH * STRIDE_SIZE_X / PACK;
input_slice[i] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)input + tmp_addr));
}
// Loop through TILE_DEPTH output features
__attribute__((opencl_unroll_hint(TILE_DEPTH)))
for (uint of = 0; of < TILE_DEPTH; ++of)
{
// Read 32 weights: 8 ints, 4 weights in each int; each SIMD lane has its own weights
weights_slice = as_int8(intel_sub_group_block_read8(weights + filter_idx));
__attribute__((opencl_unroll_hint(TILE_LENGTH)))
for (uint i = 0; i < TILE_LENGTH; ++i)
{
PACKED_TYPE A_scalar;
A_scalar = sub_group_broadcast(input_slice[i], 0); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s0));
A_scalar = sub_group_broadcast(input_slice[i], 1); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s1));
A_scalar = sub_group_broadcast(input_slice[i], 2); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s2));
A_scalar = sub_group_broadcast(input_slice[i], 3); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s3));
A_scalar = sub_group_broadcast(input_slice[i], 4); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s4));
A_scalar = sub_group_broadcast(input_slice[i], 5); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s5));
A_scalar = sub_group_broadcast(input_slice[i], 6); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s6));
A_scalar = sub_group_broadcast(input_slice[i], 7); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s7));
}
filter_idx += 32 * 8 / 4; // 32 features per channel * 8 SIMD channels / sizeof(int)
}
in_addr += 4 * 8 / 4; // 4 features per channel * 8 SIMD channels / sizeof(int) -> next 32 input features
}
#if TILE_DEPTH == 8
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result[TILE_LENGTH];
#elif TILE_DEPTH == 4
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result[TILE_LENGTH];
#endif
uint dst_index = (OUTPUT_GET_INDEX(b, f, y, x)) / PACK;
__attribute__((opencl_unroll_hint(TILE_LENGTH)))
for (uint i = 0; i < TILE_LENGTH; ++i)
{
__attribute__((opencl_unroll_hint(TILE_DEPTH)))
for (uint j = 0; j < TILE_DEPTH; ++j)
{
const uint f2 = f + lid * 4 + (j % 4) + (j / 4 * 32);
ACCUMULATOR_TYPE dotProd = accu[i][j];
#if BIAS_TERM
#if BIAS_PER_OUTPUT
const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
const uint bias_index = f2;
#endif
ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd) + TO_ACTIVATION_TYPE(biases[bias_index]);
#else
ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
#endif //BIAS_TERM
#if HAS_FUSED_OPS
FUSED_OPS;
result[i][j] = FUSED_OPS_RESULT;
#else
result[i][j] = TO_OUTPUT_TYPE(res);
#endif
}
}
__attribute__((opencl_unroll_hint(TILE_LENGTH)))
for (uint i = 0; i < TILE_LENGTH; ++i)
{
#if TILE_DEPTH == 8
intel_sub_group_block_write2((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint2(result[i]));
#elif TILE_DEPTH == 4
intel_sub_group_block_write((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint(result[i]));
#endif
}
}
#undef FILTER_IFM_SLICES_NUM
#undef FILTER_OFM_NUM_ALIGNED
#undef ACCUMULATOR_TYPE
#undef ACTIVATION_TYPE
#undef TO_ACTIVATION_TYPE
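The f2 expression above maps a (sub-group lane, accumulator slot) pair back to an output feature: each lane owns four consecutive features, and slots 4..7 jump ahead one 32-feature slice. A quick spot check (SUB_GROUP_SIZE of 8 and TILE_DEPTH of 8 are assumed):

// Feature-index mapping check for the af32 IMAD 1x1 kernel (assumed SIMD 8).
#include <cstddef>

int main() {
    const size_t f = 0, lid = 2, j = 5;                      // lane 2, slot 5
    const size_t f2 = f + lid * 4 + (j % 4) + (j / 4) * 32;  // 8 + 1 + 32 = 41
    return f2 == 41 ? 0 : 1;
}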

View File

@ -129,7 +129,6 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
#else
in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);// read SIMD_SIZE elements wide
#endif
// TODO This will cause errors for byxf_af32 format on input
in_addr += (INPUT0_SIZE_X + IWPAD); // move to next row down
#endif
}
@ -191,9 +190,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
if(!zero_c)
#endif
{
#if OUTPUT_LAYOUT_BYXF_AF32 == 1
uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
#if OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1 || OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16 == 1
uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);

View File

@ -99,20 +99,6 @@
((b) / (sub_group_size))*CAT(prefix, _BATCH_PITCH) \
)
inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size, uint y_size, uint f_size, uint offset)
{
const uint xy_idx = x + y * x_size;
const uint xy_offset = (xy_idx % 16) + (xy_idx / 16) * 16 * 8;
const uint xy_block_num = (x_size * y_size + 16 - 1) / 16;
const uint f_offset = (f % 8) * 16 + (f / 8) * xy_block_num * 16 * 8;
const uint f_block_num = (f_size + 8 - 1) / 8;
const uint b_offset = b * f_block_num * xy_block_num * 128;
const size_t idx = offset + xy_offset + f_offset + b_offset;
return idx;
}
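Restating the removed bf8_xy16 indexing in plain C++ may help: the flattened x/y position goes into 16-wide blocks, features into 8-wide blocks, and each (xy-block, f-block) pair spans 128 elements. Coordinates below are invented:

// C++ restatement of get_bf8_xy16_index (coordinates are illustrative).
#include <cstddef>

static size_t bf8_xy16_index(size_t b, size_t f, size_t y, size_t x,
                             size_t x_size, size_t y_size, size_t f_size) {
    const size_t xy_idx = x + y * x_size;
    const size_t xy_offset = (xy_idx % 16) + (xy_idx / 16) * 16 * 8;
    const size_t xy_block_num = (x_size * y_size + 15) / 16;
    const size_t f_offset = (f % 8) * 16 + (f / 8) * xy_block_num * 16 * 8;
    const size_t f_block_num = (f_size + 7) / 8;
    return xy_offset + f_offset + b * f_block_num * xy_block_num * 128;
}

int main() {
    // b=0, f=9, y=1, x=3 in a 16x16 spatial, 16-feature tensor:
    // xy_idx = 19 -> xy_offset = 3 + 128; f_offset = 16 + 1 * 16 * 16 * 8 = 2064
    return bf8_xy16_index(0, 9, 1, 3, 16, 16, 16) == 2195 ? 0 : 1;
}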
inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x,
uint x_size, uint y_size, uint f_size, uint b_size,
uint b_pad_before, uint b_pad_after,
@ -495,93 +481,6 @@ inline uint FUNC(get_os_zyxi_osv16_index)(uint o, uint i, uint z, uint y, uint x
CAT(prefix, _SIZE_Y), \
CAT(prefix, _SIZE_Z))
inline uint FUNC(get_byxf_af32_index)(uint b, uint f, uint y, uint x, uint y_pitch, uint b_pitch, uint f_size, uint f_pad_before, uint f_pad_after, uint offset)
{
const uint f_aligned_to_32 = ((f_size + 31) / 32) * 32;
const uint x_pitch = f_pad_before + f_aligned_to_32 + f_pad_after;
const uint b_offset = b * b_pitch;
const uint xy_offset = x_pitch * x + y_pitch * y;
const uint f_offset = f;
const size_t idx = offset + xy_offset + b_offset + f_offset;
return idx;
}
#define GET_DATA_BYXF_AF32_INDEX(prefix, b, f, y, x) \
FUNC_CALL(get_byxf_af32_index)( \
b, f, y, x, CAT(prefix, _Y_PITCH), \
CAT(prefix, _BATCH_PITCH), \
CAT(prefix, _FEATURE_NUM), \
CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \
CAT(prefix, _PAD_AFTER_FEATURE_NUM), \
CAT(prefix, _OFFSET))
inline uint FUNC(get_byx8_f4_index)(uint b, uint f, uint y, uint x,
uint x_pitch, uint y_pitch, uint b_pitch, uint f_size, uint x_size, uint offset)
{
const uint f_aligned_to_4 = ((f_size + 3) / 4) * 4;
const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8;
const uint b_offset = b * b_pitch;
const uint xy_offset = x * x_pitch + y * y_pitch;
const uint f_offset = f;
const size_t idx = offset + xy_offset + b_offset + f_offset;
return idx;
}
#define GET_DATA_BYX8_F4_INDEX(prefix, b, f, y, x) \
FUNC_CALL(get_byx8_f4_index)( \
b, f, y, x, CAT(prefix, _X_PITCH), \
CAT(prefix, _Y_PITCH), \
CAT(prefix, _BATCH_PITCH), \
CAT(prefix, _FEATURE_NUM), \
CAT(prefix, _SIZE_X), \
CAT(prefix, _OFFSET))
#define GET_DATA_BF8_XY16_INDEX(prefix, b, f, y, x) \
FUNC_CALL(get_bf8_xy16_index)( \
b, f, y, x, CAT(prefix, _SIZE_X ), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _FEATURE_NUM), \
CAT(prefix, _OFFSET))
inline uint FUNC(get_fs_bs_yx_bsv4_fsv32_index)(uint b, uint f, uint y, uint x,
uint x_pad_before, uint x_size, uint x_pad_after,
uint y_pad_before, uint y_size, uint y_pad_after,
uint size_f, uint size_b)
{
const uint f_32_aligned = ((size_f + 31)/32) * 32;
const uint b_4_aligned = ((size_b + 3)/4) * 4;
const uint fsv_idx = f % 32;
const uint bsv_idx = b % 4;
const uint fs_idx = f / 32;
const uint bs_idx = b / 4;
const uint x_pitch = 32 * 4;
const uint y_pitch = 32 * 4 * (x_pad_before + x_size + x_pad_after);
const uint bs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after);
const uint fs_pitch = bs_pitch * (b_4_aligned / 4);
uint offset = x_pitch * x_pad_before + y_pitch * y_pad_before;
size_t idx = offset + fsv_idx + bsv_idx * 32;
idx += 32*4 * x;
idx += y * y_pitch;
idx += bs_idx * bs_pitch;
idx += fs_idx * fs_pitch;
return idx;
}
#define GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(prefix, b, f, y, x) \
FUNC_CALL(get_fs_bs_yx_bsv4_fsv32_index)( \
b, f, y, x, \
CAT(prefix, _PAD_BEFORE_SIZE_X), \
CAT(prefix, _SIZE_X), \
CAT(prefix, _PAD_AFTER_SIZE_X), \
CAT(prefix, _PAD_BEFORE_SIZE_Y), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _PAD_AFTER_SIZE_Y), \
CAT(prefix, _FEATURE_NUM), \
CAT(prefix, _BATCH_NUM))
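In this layout a single pixel stores a full 32-feature by 4-batch block, so x_pitch is the constant 32 * 4 and the remaining pitches chain off it. A host-side C transcription of that pitch chain, illustrative only and assuming zero padding (the padding parameters are dropped):

#include <stdio.h>

/* Host-side transcription of get_fs_bs_yx_bsv4_fsv32_index with zero padding,
 * kept only to illustrate the pitch chain of the removed layout. */
static unsigned fs_bs_yx_bsv4_fsv32_index(unsigned b, unsigned f, unsigned y, unsigned x,
                                          unsigned x_size, unsigned y_size, unsigned b_size) {
    unsigned b_4_aligned = ((b_size + 3) / 4) * 4;
    unsigned x_pitch = 32 * 4;                          /* one pixel: 32 features x 4 batches */
    unsigned y_pitch = x_pitch * x_size;                /* one row of pixels */
    unsigned bs_pitch = y_pitch * y_size;               /* one batch-slice of 4 batches */
    unsigned fs_pitch = bs_pitch * (b_4_aligned / 4);   /* one feature-slice of 32 features */
    return (f % 32) + (b % 4) * 32 + x * x_pitch + y * y_pitch
         + (b / 4) * bs_pitch + (f / 32) * fs_pitch;
}

int main(void) {
    /* 2x2 plane, 8 batches: batch 5 -> batch-slice 1, feature 33 -> feature-slice 1. */
    printf("%u\n", fs_bs_yx_bsv4_fsv32_index(5, 33, 0, 1, 2, 2, 8));   /* -> 1697 */
    return 0;
}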
#define GET_FILTER_GOIYX(prefix, g, o, i, y, x) \
CAT(prefix, _OFFSET) + \
(x)*CAT(prefix, _X_PITCH) + \

View File

@ -172,7 +172,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
}
#endif
#if OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
#if OUTPUT_LAYOUT_B_FS_YX_FSV4
const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
#if OUTPUT_FEATURE_NUM % 4 == 0
*((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;

View File

@ -1,189 +0,0 @@
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
#if MAX_POOLING
#define INIT_VAL ACCUMULATOR_VAL_MIN
#elif AVG_POOLING
#define INIT_VAL ACCUMULATOR_VAL_ZERO
#else
#error
#endif
inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
{
#if MAX_POOLING
return ACCUMULATOR_MAX_FUNC(tmp, in);
#elif AVG_POOLING
return tmp + in;
#endif
}
KERNEL(pooling_gpu_byxf_af32)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
, FUSED_OPS_DECLS
#endif
)
{
const uint x = (uint)get_global_id(0);
const uint y = (uint)get_global_id(1);
const uint bf = (uint)get_global_id(2);
// we process 4 features per work-item, which is why the feature count is divided by 4
const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
const uint f = 4 * (bf % (aligned32_features / 4));
const uint b = bf / (aligned32_features / 4);
typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_t;
if (x >= OUTPUT_SIZE_X)
{
return;
}
const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
ACCUMULATOR_VEC4 result = INIT_VAL;
#ifdef CHECK_BOUNDRY
if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
{
return;
}
#ifdef DYNAMIC_KERNEL_DIVIDER
uint num_elementes = 0;
#endif
const uint batch_and_feature_offset = GET_DATA_INDEX(INPUT0, b, f, 0, 0);
for(uint j = 0; j < POOL_SIZE_Y; j++)
{
int input_offset_y = offset_y + j;
bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
if(!zero_y)
{
for(uint i = 0; i < POOL_SIZE_X; i++)
{
int input_offset_x = offset_x + i;
bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
if(!zero)
{
const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
#ifdef DYNAMIC_KERNEL_DIVIDER
num_elementes++;
#endif
}
}
}
}
#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
const uint num_elementes = (hend - offset_y) * (wend - offset_x);
#endif
#else
uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x);
for(uint j = 0; j < POOL_SIZE_Y; j++)
{
for(uint i = 0; i < POOL_SIZE_X; i++)
{
input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
input_idx += INPUT0_X_PITCH;
}
input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH);
}
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
#endif
#endif
#if defined AVG_POOLING
#if ENABLE_ROUND
int4 not_fused_result;
for (uint i = 0; i < 4; ++i) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
not_fused_result[i] = convert_int(round((float)result[i] / max(num_elementes, (uint)1)));
#else
not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
#endif
}
#else // ENABLE_ROUND
float4 not_fused_result;
for (uint i = 0; i < 4; ++i) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
not_fused_result[i] = (float)result[i] / max(num_elementes, (uint)1);
#else
not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
#endif
}
#endif // ENABLE_ROUND
#else // AVG_POOLING
float4 not_fused_result = convert_float4(result);
#endif // AVG_POOLING
OUTPUT_VEC4 final_result;
#if HAS_FUSED_OPS
ACTIVATION_VEC4 fused_pool_result = TO_ACTIVATION_VEC4(not_fused_result);
FUSED_OPS;
final_result = FUSED_OPS_RESULT;
for(uint op = 0; op < 4; op++)
{
const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
output[output_pos] = final_result[op];
}
#else
final_result = TO_OUTPUT_VEC4(not_fused_result);
for(uint op = 0; op < 4; op++)
{
const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
final_result[op] = TO_OUTPUT_TYPE(ACTIVATION(not_fused_result[op], ACTIVATION_PARAMS));
output[output_pos] = final_result[op];
}
#endif
}
#undef INIT_VAL
#undef ACCUMULATOR_VEC4
#undef ACTIVATION_VEC4
#undef TO_ACTIVATION_VEC4
#undef OUTPUT_VEC4
#undef TO_OUTPUT_VEC4
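The divider choice in the deleted kernel above is the standard average-pooling border trade-off: with DYNAMIC_KERNEL_DIVIDER the sum is divided by the number of in-bounds taps, otherwise by the full window size. A 1D C sketch of the two behaviours (avg_pool_1d is a hypothetical helper, not the kernel's code path):

#include <stdio.h>

/* Average over a pool window clipped against [0, size): 'dynamic' divides by
 * the taps actually inside the input, 'static' always by the window size. */
static float avg_pool_1d(const float *in, int size, int start, int win, int dynamic) {
    float sum = 0.0f;
    int taps = 0;
    for (int i = 0; i < win; ++i) {
        int p = start + i;
        if (p >= 0 && p < size) { sum += in[p]; ++taps; }
    }
    int div = dynamic ? (taps > 0 ? taps : 1) : win;
    return sum / (float)div;
}

int main(void) {
    float in[4] = {1, 2, 3, 4};
    /* window of 3 starting at -1 covers only {1, 2} */
    printf("dynamic=%.3f static=%.3f\n",
           avg_pool_1d(in, 4, -1, 3, 1),    /* (1+2)/2 = 1.500 */
           avg_pool_1d(in, 4, -1, 3, 0));   /* (1+2)/3 = 1.000 */
    return 0;
}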

View File

@ -40,7 +40,7 @@ KERNEL(pooling_gpu_int8_ref)(
#endif
)
{
#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BYXF_AF32 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
const uint x = (uint)get_global_id(0);
const uint yz = (uint)get_global_id(1);
#if OUTPUT_DIMS == 5

View File

@ -44,7 +44,7 @@ KERNEL(pooling_gpu)(
)
{
#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\
OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4
const uint x = (uint)get_global_id(0);
#if OUTPUT_DIMS == 5
const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;

View File

@ -27,16 +27,8 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_BF8_XY16
return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYXF_AF32
return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYX8_F4
return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_FS_B_YX_FSV32

View File

@ -1,130 +0,0 @@
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/reshape_dims.cl"
#include "include/fetch.cl"
#include "include/data_types.cl"
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x)
{
#if INPUT0_SIMPLE
return GET_DATA_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
defined INPUT0_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
#elif defined INPUT0_LAYOUT_BF8_XY16
return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BYXF_AF32
return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BYX8_F4
return GET_DATA_BYX8_F4_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_FS_BS_YX_BSV4_FSV32
return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV4
return GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, y, x);
#else
#error reorder_data.cl: input format - not supported
#endif
}
///////////////////////// Output Index /////////////////////////
inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x)
{
#if OUTPUT_SIMPLE
return GET_DATA_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_BF8_XY16
return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYXF_AF32
return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYX8_F4
return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
#else
#error reorder_data.cl: output format - not supported
#endif
}
__attribute__((intel_reqd_sub_group_size(16)))
KERNEL (reorder_data_byxf_f32_to_byx8_f4_i8)(
const __global INPUT_REORDER_TYPE* input,
__global OUTPUT_REORDER_TYPE* output
#ifdef MEAN_SUBTRACT_IN_BUFFER
, __global MEAN_SUBTRACT_TYPE* mean_subtract
#endif
)
{
const uint x = get_global_id(0);
const uint y = get_group_id(1);
const uint b = (uint)get_group_id(2) * WG_BATCH_SIZE + (uint)get_sub_group_id();
const uint input_idx = FUNC_CALL(get_input_index)(b, 0, y, x);
const uint output_idx = FUNC_CALL(get_output_index)(b, 0, y, x);
#if defined MEAN_SUBTRACT_INSIDE_PARAMS
float4 res;
res.s0 = TO_MEAN_TYPE(input[input_idx]);
res.s0 = MEAN_OP(res.s0, VALUE_TO_SUBTRACT[0 % VALUE_TO_SUBTRACT_SIZE]);
res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
res.s1 = MEAN_OP(res.s1, VALUE_TO_SUBTRACT[1 % VALUE_TO_SUBTRACT_SIZE]);
res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
res.s2 = MEAN_OP(res.s2, VALUE_TO_SUBTRACT[2 % VALUE_TO_SUBTRACT_SIZE]);
res.s3 = 0;
#elif defined MEAN_SUBTRACT_IN_BUFFER
#if defined MEAN_PER_FEATURE
MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
res.s0 = TO_MEAN_TYPE(input[input_idx]);
res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
res.s3 = 0;
#else
MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
res.s0 = TO_MEAN_TYPE(input[input_idx]);
res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
res.s3 = 0;
res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
#endif
#else
MAKE_VECTOR_TYPE(CALC_TYPE, 4) res;
res.s0 = TO_CALC_TYPE(input[input_idx]);
res.s1 = TO_CALC_TYPE(input[input_idx+1]);
res.s2 = TO_CALC_TYPE(input[input_idx+2]);
res.s3 = 0;
#endif
char4 out_vals;
out_vals.s0 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s0), ACTIVATION_PARAMS_TYPED);
out_vals.s1 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s1), ACTIVATION_PARAMS_TYPED);
out_vals.s2 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s2), ACTIVATION_PARAMS_TYPED);
out_vals.s3 = 0;
__global uint* dst = (__global uint*)output;
dst[output_idx/4] = as_uint(out_vals);
}

View File

@ -30,8 +30,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
defined INPUT0_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
#elif defined INPUT0_LAYOUT_BF8_XY16
return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_ZYX_FSV16
@ -54,8 +52,6 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_BF8_XY16
return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_ZYX_FSV16

View File

@ -26,10 +26,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x)
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
defined INPUT0_LAYOUT_BS_F_BSV16__AF8
return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
#elif defined INPUT0_LAYOUT_BF8_XY16
return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BYXF_AF32
return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_FS_B_YX_FSV32

View File

@ -330,8 +330,6 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
raw_index_func_val = "GET_DATA_INDEX_RAW(" + _name + ", b, f, y, x)";
} else if (layout == DataLayout::b_fs_yx_fsv16 ||
layout == DataLayout::b_fs_yx_fsv32 ||
layout == DataLayout::byxf_af32 ||
layout == DataLayout::fs_bs_yx_bsv4_fsv32 ||
layout == DataLayout::b_fs_yx_fsv4 ||
layout == DataLayout::fs_b_yx_fsv32 ||
layout == DataLayout::bs_fs_yx_bsv16_fsv16) {

View File

@ -103,11 +103,7 @@ std::string toString(DataLayout l) {
case kernel_selector::DataLayout::b_fs_zyx_fsv32: return "B_FS_ZYX_FSV32";
case kernel_selector::DataLayout::bs_f_bsv8__af8: return "BS_F_BSV8__AF8";
case kernel_selector::DataLayout::bs_f_bsv16__af8: return "BS_F_BSV16__AF8";
case kernel_selector::DataLayout::bf8_xy16: return "BF8_XY16";
case kernel_selector::DataLayout::winograd_2x3_s1_data: return "WINOGRAD_2x3_S1_DATA";
case kernel_selector::DataLayout::byxf_af32: return "BYXF_AF32";
case kernel_selector::DataLayout::byx8_f4: return "BYX8_F4";
case kernel_selector::DataLayout::fs_bs_yx_bsv4_fsv32: return "FS_BS_YX_BSV4_FSV32";
case kernel_selector::DataLayout::b_fs_yx_fsv4: return "B_FS_YX_FSV4";
case kernel_selector::DataLayout::b_fs_yx_32fp: return "B_FS_YX_32FP";
case kernel_selector::DataLayout::bfzyx: return "BFZYX";

View File

@ -272,40 +272,6 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
return {output_type, format::b_fs_yx_32fp, output_size};
}
// for performance reasons, when using fs_bs_yx_bsv4_fsv32 the first convolution has 3 features, so the first conv layer
// will take byxf and return fs_bs_yx_bsv4_fsv32
if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
}
auto users = node.get_users();
if (users.size() == 1 && users.front()->is_type<convolution>()) {
auto conv_split = users.front()->as<convolution>().get_split();
auto conv_groups = (int32_t)users.front()->as<convolution>().get_groups();
bool next_is_dw = ((conv_split > 1 && conv_split == output_size.feature[0]) ||
(conv_groups > 1 && conv_groups == output_size.feature[0]));
if (input_layout.data_type == data_types::i8 && input_layout.format == format::b_fs_yx_fsv4 && next_is_dw) {
return layout{output_type, cldnn::format::byxf_af32, output_size};
}
auto prev_node = node.get_dependencies().front();
if (prev_node->is_type<reorder>())
prev_node = prev_node->get_dependencies().front();
auto prev_is_convo = prev_node->is_type<convolution>();
if (prev_is_convo) {
auto prev2_node = prev_node->get_dependencies().front();
auto prev_input_format = prev2_node->get_output_layout().format;
if (input_layout.data_type == data_types::i8 && input_layout.format == format::byxf_af32 && !next_is_dw &&
prev_input_format == format::b_fs_yx_fsv4) {
return layout{output_type, cldnn::format::b_fs_yx_fsv4, output_size};
}
}
}
return {output_type, input_layout.format, output_size};
}

View File

@ -230,13 +230,6 @@ layout fused_conv_eltwise_inst::calc_output_layout(fused_conv_eltwise_node const
output_range.spatial[1],
output_range.spatial[2]);
// for performance reasons, when using fs_bs_yx_bsv4_fsv32 the first convolution has 3 features, so the first conv layer
// will take byxf and return fs_bs_yx_bsv4_fsv32
if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
}
return {output_type, input_layout.format, output_size};
}

View File

@ -150,8 +150,6 @@ attach_concatenation_gpu::attach_concatenation_gpu() {
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), concatenation_gpu::create},
// MMAD
{std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), concatenation_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), concatenation_gpu::create},

View File

@ -189,8 +189,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfzyx), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::winograd_2x3_s1_data), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::winograd_2x3_s1_data), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bf8_xy16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bf8_xy16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
// block f16 format
@ -202,11 +200,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
// MMAD
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
@ -216,7 +209,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);

View File

@ -1,192 +0,0 @@
/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#ifdef CODE_PREFIX
#define CODE_BEGIN CODE_PREFIX
#define CODE_END CODE_POSTFIX
#else
#define CODE_BEGIN
#define CODE_END
#endif
CODE_BEGIN
enum neural_memory_format {
x_f32,
xb_f32, // 1D+batch, float32
bx_f32, // 1D+batch, float32
yxfb_f32, // 3D+batch, float32
byxf_f32, // for convolution_cpu_jit_batch1
bfyx_f32, // used in Caffe
fyxb_f32, // used in Caffe
oiyx_f32, // format used only for weights: o - output feature maps, i - input feature maps
byxf_b24_f32, // for convolution_cpu_generic
yxoi_o4_f32, // for convolution_cpu_generic
os_yxi_sv16_f32, // format used only for weights: os - output slice, i - input feature maps, sv16 - 16 values of single slice
bs_yxf_bv24_f32,
any=-1
};
#pragma pack(push, 4)
typedef struct _neural_memory_tag {
uint format;
uint feature_offset;
uint spatial_offset;
uint vector_size;
uint data_offset;
uint data[1];
} neural_memory;
typedef struct _neural_vector_tag {
uint feature_offset;
uint spatial_offset;
uint raw_size;
uint data[1];
} neural_vector;
#pragma pack(pop)
// neural_memory accessors
__attribute__((overloadable)) __global uint* get_raw(__global neural_memory* mem) { return &(mem->data[0]); }
__attribute__((overloadable)) const __global uint* get_raw(const __global neural_memory* mem) { return &(mem->data[0]); }
__attribute__((overloadable)) uint get_raw_size(const __global neural_memory* mem) { return mem->vector_size; }
__attribute__((overloadable)) __global uint* get_batch(__global neural_memory* mem) { return get_raw(mem); }
__attribute__((overloadable)) const __global uint* get_batch(const __global neural_memory* mem) { return get_raw(mem); }
__attribute__((overloadable)) uint get_batch_size(const __global neural_memory* mem) { return mem->feature_offset; }
__attribute__((overloadable)) __global uint* get_feature(__global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
__attribute__((overloadable)) const __global uint* get_feature(const __global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
__attribute__((overloadable)) uint get_feature_size(const __global neural_memory* mem) { return mem->spatial_offset - mem->feature_offset; }
__attribute__((overloadable)) __global uint* get_spatial(__global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
__attribute__((overloadable)) uint get_spatial_size(const __global neural_memory* mem) { return get_raw_size(mem) - mem->spatial_offset; }
__attribute__((overloadable)) __global void* get_data(__global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
__attribute__((overloadable)) const __global void* get_data(const __global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
__attribute__((overloadable)) size_t get_element_size(const __global neural_memory* mem) { return sizeof(float); }
__attribute__((overloadable)) size_t get_data_size(const __global neural_memory* mem) {
size_t result = get_element_size(mem);
const __global uint* raw = get_raw(mem);
uint raw_size = get_raw_size(mem);
for(uint i = 0; i < raw_size; i++) {
result *= raw[i];
}
return result;
}
// neural_vector accessors
// TODO NOTE: non-const accessors are disabled for now, because only read-only neural_vector arguments are currently supported
//__attribute__((overloadable)) __global uint* get_raw(__global neural_vector* v) { return &(v->data[0]); }
__attribute__((overloadable)) const __global uint* get_raw(const __global neural_vector* v) { return &(v->data[0]); }
__attribute__((overloadable)) uint get_raw_size(const __global neural_vector* v) { return v->raw_size; }
//__attribute__((overloadable)) __global uint* get_batch(__global neural_vector* v) { return get_raw(v); }
__attribute__((overloadable)) const __global uint* get_batch(const __global neural_vector* v) { return get_raw(v); }
__attribute__((overloadable)) uint get_batch_size(const __global neural_vector* v) { return v->feature_offset; }
//__attribute__((overloadable)) __global uint* get_feature(__global neural_vector* v) { return &(v->data[v->feature_offset]); }
__attribute__((overloadable)) const __global uint* get_feature(const __global neural_vector* v) { return &(v->data[v->feature_offset]); }
__attribute__((overloadable)) uint get_feature_size(const __global neural_vector* v) { return v->spatial_offset - v->feature_offset; }
//__attribute__((overloadable)) __global uint* get_spatial(__global neural_vector* v) { return &(v->data[v->spatial_offset]); }
__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_vector* v) { return &(v->data[v->spatial_offset]); }
__attribute__((overloadable)) uint get_spatial_size(const __global neural_vector* v) { return get_raw_size(v) - v->spatial_offset; }
CODE_END
/*
KERNEL(Fully_Connected_GPU)
DECLARE_CONSTANT()
BEGIN_ARGUMENTS_DECLARATION
DECLARE_INPUT_MEMORY_ARGUMENT(input_mem)
DECLARE_INPUT_MEMORY_ARGUMENT(weights_mem)
DECLARE_INPUT_MEMORY_ARGUMENT(bias_mem)
DECLARE_OUTPUT_MEMORY_ARGUMENT(dst_mem)
END_ARGUMENTS_DECLARATION
CODE_BEGIN
#define WEIGHTS { 1.0, 3.2, 4.5, 6.7 }
#define WEIGHTS_SIZE { 2, 2 }
#define WEIGHTS_DIM 2
*/
__kernel void Fully_Connected_GPU(__global neural_memory* input_mem, __global neural_memory* weights_mem, __global neural_memory* bias_mem, __global neural_memory* dst_mem)
{
__global uint* input_size = get_raw(input_mem);
__global uint* weights_size = get_raw(weights_mem);
__global float* input = (__global float*)get_data(input_mem);
__global float* weights = (__global float*)get_data(weights_mem);
__global float* bias = (__global float*)get_data(bias_mem);
__global float* pDst = (__global float*)get_data(dst_mem);
const int x = get_global_id(0);
pDst[x] = 0;
uint outXIdx = x / input_size[0];
uint inputBatchIdx = x % input_size[0];
uint weightYIdx = outXIdx * weights_size[0];
for (uint i = 0; i < input_size[2]; i++)
{
pDst[x] += input[i * input_size[0] + inputBatchIdx] * weights[weightYIdx + i];
}
pDst[x] += bias[outXIdx];
}
CODE_END
CODE_BEGIN
__kernel void Convolution_GPU(
const __global neural_memory* input_mem,
const __global neural_memory* filter_mem,
float bias,
__global neural_memory* dst_mem,
const __global neural_vector* spatial_stride)
{
//
const __global uint* input_size = get_raw(input_mem);
const __global uint* filter_size = get_raw(filter_mem);
const __global uint* dst_size = get_raw(dst_mem);
const __global float* input = (const __global float*)get_data(input_mem);
const __global float* filter = (const __global float*)get_data(filter_mem);
__global float* pDst = (__global float*)get_data(dst_mem);
//
int global_id = get_global_id(0);
const int batch_num = dst_size[0];
const int batch_offset = global_id % dst_size[0];
const int idx = global_id / batch_num;
const int x = (idx % input_size[2]) * get_spatial(spatial_stride)[0];
const int y = (idx * get_spatial(spatial_stride)[1]) / input_size[2];
const int out_offset = idx * batch_num + batch_offset;
pDst[out_offset] = 0;
for (uint i = 0; i < filter_size[4]; i++)
{
for (uint j = 0; j < filter_size[3]; j++)
{
int input_idx = (x + j + ((y + i) * input_size[2])) * batch_num + batch_offset;
int filter_idx = i * filter_size[3] + j;
pDst[out_offset] += input[input_idx] * filter[filter_idx];
}
}
pDst[out_offset] += bias;
}
CODE_END
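Stripped of the neural_memory plumbing, the removed Convolution_GPU kernel is a naive direct convolution with a scalar bias. An equivalent single-feature-map loop nest in C — a sketch assuming the output extents keep every tap in bounds:

#include <stdio.h>

/* Host-side equivalent of the removed Convolution_GPU kernel for a single
 * feature map: direct 2D convolution with stride and a scalar bias. */
static void conv2d(const float *in, int in_w,
                   const float *flt, int f_w, int f_h,
                   int stride_x, int stride_y, float bias,
                   float *out, int out_w, int out_h) {
    for (int oy = 0; oy < out_h; ++oy)
        for (int ox = 0; ox < out_w; ++ox) {
            float acc = bias;
            for (int fy = 0; fy < f_h; ++fy)
                for (int fx = 0; fx < f_w; ++fx)
                    acc += in[(oy * stride_y + fy) * in_w + (ox * stride_x + fx)]
                         * flt[fy * f_w + fx];
            out[oy * out_w + ox] = acc;
        }
}

int main(void) {
    float in[9]  = {1, 1, 1,  1, 2, 1,  1, 1, 1};   /* 3x3 input */
    float flt[4] = {1, 0,  0, 1};                   /* 2x2 filter */
    float out[4];
    conv2d(in, 3, flt, 2, 2, 1, 1, 0.5f, out, 2, 2);
    printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);  /* 3.5 2.5 2.5 3.5 */
    return 0;
}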

View File

@ -182,9 +182,6 @@ attach_eltwise_gpu::attach_eltwise_gpu() {
{ std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
// MMAD
{ std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), eltwise_gpu::create },

View File

@ -100,8 +100,6 @@ attach_fully_connected_gpu::attach_fully_connected_gpu() {
{std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw},
{std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw},
// MMAD
{std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), val_fw},
// IMAD

View File

@ -183,23 +183,13 @@ attach_fused_conv_eltwise_gpu::attach_fused_conv_eltwise_gpu() {
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
fused_conv_eltwise_gpu::create);
// MMAD
implementation_map<fused_conv_eltwise>::add(
std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32),
fused_conv_eltwise_gpu::create);
// IMAD
implementation_map<fused_conv_eltwise>::add(
std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(
std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32),
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32),
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::image_2d_rgba),
fused_conv_eltwise_gpu::create);
}
} // namespace detail

View File

@ -206,9 +206,6 @@ attach_pooling_gpu::attach_pooling_gpu() {
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), pooling_gpu::create);

View File

@ -113,11 +113,6 @@ attach_quantize_gpu::attach_quantize_gpu() {
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw);

View File

@ -99,11 +99,7 @@ attach_resample_gpu::attach_resample_gpu() {
{std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), resample_gpu::create}});
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create}});
}
} // namespace detail

View File

@ -127,10 +127,6 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
(l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
return false;
// TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
return false;
if (l.format == format::bs_fs_yx_bsv16_fsv16)
return false;

View File

@ -121,12 +121,9 @@ void prepare_padding::run(program_impl& p) {
// right now output padding optimization is only available for bfyx format and data type = float32
if (conv_layout.format != cldnn::format::bfyx &&
conv_layout.format != cldnn::format::bf8_xy16 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
conv_layout.format != cldnn::format::byxf_af32 &&
conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv4 &&
conv_layout.format != cldnn::format::fs_b_yx_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_32fp) {

View File

@ -760,11 +760,8 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
for (auto& dep : eltw_node->get_dependencies()) {
format fmt = dep->get_output_layout().format;
data_types dep_dt = dep->get_output_layout().data_type;
if ((fmt != format::fs_bs_yx_bsv4_fsv32 || dep_dt != data_types::i8) &&
(fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
if ((fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
(fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::u8) &&
(fmt != format::byxf_af32 || dep_dt != data_types::i8) &&
(fmt != format::byxf_af32 || dep_dt != data_types::u8) &&
(fmt != format::bfyx || dep_dt != data_types::f32) && (fmt != format::bfyx || dep_dt != data_types::u8) &&
(fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16) &&
(fmt != format::bfyx || dep_dt != data_types::f16 || !if_already_depth_to_space_fused))

View File

@ -372,70 +372,6 @@ void reorder_inputs::run(program_impl& p, layout_optimizer& lo, reorder_factory&
auto fmt_map = get_preferred_formats(p, lo);
propagate_formats(p, fmt_map, lo);
minimize_local_reorders(p, fmt_map, lo);
// WA START ============================================================================================================
if (lo.get_optimization_attributes().b_fs_yx_fsv16_network) {
// This is a temporary work-around for a known bad case until byxf_af32 handling is corrected in layout_optimizer.
//
// Find pattern:
// mvn(int8, b_fs_yx_fsv16, [x,16,1280,720]) -> conv(int8, byxf_af32, [x,3,1280,720]) -> mvn(*, bfyx) ->
// Replace with:
// mvn(b_fs_yx_fsv16) -> conv(b_fs_yx_fsv16) -> mvn(b_fs_yx_fsv16) ->
//
// For such a convolution b_fs_yx_fsv16 generally performs better than byxf_af32,
// but this WA is needed to avoid unvalidated int8 b_fs_yx_fsv16 networks and potential regressions.
// Additionally, a reorder from af32 -> bfyx takes ~9 times longer than the convolution itself.
for (auto& node_ptr : p.get_processing_order()) {
if (!node_ptr->is_in_data_flow() || !node_ptr->is_type<convolution>() || fmt_map.at(node_ptr) != format::byxf_af32)
continue;
auto& conv_node = node_ptr->as<convolution>();
bool input_path =
conv_node.input().get_output_layout().data_type == data_types::i8 &&
conv_node.input().is_type<mvn>() &&
fmt_map.at(&conv_node.input()) == format::b_fs_yx_fsv16;
bool output_path =
conv_node.get_users().size() == 1 &&
conv_node.get_users().front()->is_type<mvn>() &&
fmt_map.at(conv_node.get_users().front()) == format::bfyx &&
conv_node.get_users().front()->get_users().size() == 1 &&
!conv_node.get_users().front()->as<mvn>().get_primitive()->across_channels;
if (!input_path || !output_path)
continue;
auto in_lay = conv_node.input().get_output_layout();
auto out_lay = conv_node.get_output_layout();
auto wei_lay = conv_node.weights().get_output_layout();
bool correct_layouts =
// weights
wei_lay.data_type == data_types::i8 &&
wei_lay.size.spatial[0] == 3 && wei_lay.size.spatial[1] == 3 &&
// input/output
in_lay.data_type == data_types::i8 && out_lay.data_type == data_types::i8 &&
in_lay.size.feature[0] == 16 && out_lay.size.feature[0] == 3 &&
in_lay.size.spatial[0] == 1280 && out_lay.size.spatial[0] == 1280 &&
in_lay.size.spatial[1] == 720 && out_lay.size.spatial[1] == 720;
if (!correct_layouts)
continue;
bool correct_conv =
conv_node.get_groups() == 1 && conv_node.get_split() == 1 && conv_node.get_deformable_groups() == 1 &&
!conv_node.get_depthwise_sep_opt() && !conv_node.get_transposed() &&
!conv_node.activations_zero_points_term() && !conv_node.weights_zero_points_term() && !conv_node.compensation_term() &&
conv_node.get_primitive()->dilation == tensor(1);
if (!correct_conv)
continue;
fmt_map.at(node_ptr) = format::b_fs_yx_fsv16;
fmt_map.at(conv_node.get_users().front()) = format::b_fs_yx_fsv16;
}
}
// WA END ==============================================================================================================
insert_reorders(p, fmt_map, rf);
for (auto n : p.get_processing_order()) {

View File

@ -85,16 +85,8 @@ inline std::string fmt_to_str(format fmt) {
return "bs_xs_xsv8_bsv16";
case format::bs_x_bsv16:
return "bs_x_bsv16";
case format::bf8_xy16:
return "bf8_xy16";
case format::winograd_2x3_s1_data:
return "winograd_2x3_s1_data";
case format::byxf_af32:
return "byxf_af32";
case format::byx8_f4:
return "byx8_f4";
case format::fs_bs_yx_bsv4_fsv32:
return "fs_bs_yx_bsv4_fsv32";
case format::b_fs_yx_fsv4:
return "b_fs_yx_fsv4";
case format::b_fs_yx_32fp:

View File

@ -126,16 +126,8 @@ kernel_selector::data_layout to_data_layout(format f) {
return kernel_selector::data_layout::bs_f_bsv8__af8;
case format::bs_xs_xsv8_bsv16:
return kernel_selector::data_layout::bs_f_bsv16__af8;
case format::bf8_xy16:
return kernel_selector::data_layout::bf8_xy16;
case format::winograd_2x3_s1_data:
return kernel_selector::data_layout::winograd_2x3_s1_data;
case format::byxf_af32:
return kernel_selector::data_layout::byxf_af32;
case format::byx8_f4:
return kernel_selector::data_layout::byx8_f4;
case format::fs_bs_yx_bsv4_fsv32:
return kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32;
case format::b_fs_yx_fsv4:
return kernel_selector::data_layout::b_fs_yx_fsv4;
case format::b_fs_yx_32fp:
@ -185,16 +177,8 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
return cldnn::format::bs_xs_xsv8_bsv8;
case kernel_selector::data_layout::bs_f_bsv16__af8:
return cldnn::format::bs_x_bsv16;
case kernel_selector::data_layout::bf8_xy16:
return cldnn::format::bf8_xy16;
case kernel_selector::data_layout::winograd_2x3_s1_data:
return cldnn::format::winograd_2x3_s1_data;
case kernel_selector::data_layout::byxf_af32:
return cldnn::format::byxf_af32;
case kernel_selector::data_layout::byx8_f4:
return cldnn::format::byx8_f4;
case kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32:
return cldnn::format::fs_bs_yx_bsv4_fsv32;
case kernel_selector::data_layout::b_fs_yx_32fp:
return cldnn::format::b_fs_yx_32fp;
case kernel_selector::data_layout::bfzyx:
@ -510,20 +494,9 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split
size_t pitch = 1;
auto new_vals = vals;
if (ks_layout == kernel_selector::Tensor::byxf_af32) {
new_vals[3] = align_to(vals[3], 32);
}
if (ks_layout == kernel_selector::Tensor::b_fs_yx_fsv32) {
new_vals[1] = align_to(vals[1], 32);
}
if (ks_layout == kernel_selector::Tensor::fs_bs_yx_bsv4_fsv32) {
new_vals[3] = align_to(vals[3], 32);
new_vals[2] = align_to(vals[2], 4);
}
if (ks_layout == kernel_selector::Tensor::byx8_f4) {
new_vals[3] = align_to(vals[3], 4);
new_vals[2] = align_to(vals[2], 8);
}
if (ks_layout == kernel_selector::Tensor::bs_fs_yx_bsv16_fsv16) {
new_vals[0] = align_to(vals[0], 16);
new_vals[1] = align_to(vals[1], 16);

View File

@ -183,22 +183,13 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
if (next.is_type<fully_connected>() &&
(fmt_prev == format::bfyx || fmt_prev == format::yxfb ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::fs_b_yx_fsv32 ||
fmt_prev == format::byxf_af32 || fmt_prev == format::b_fs_yx_fsv32 ||
fmt_prev == format::b_fs_yx_fsv32 ||
(fmt_prev == format::b_fs_yx_fsv4 &&
prev_output_layout.size.feature[0] % 32 == 0 &&
prev_output_layout.size.spatial[0] == 1 &&
prev_output_layout.size.spatial[1] == 1)))
return true;
if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::b_fs_yx_fsv4 && next.as<convolution>().get_groups() != 1)
return true;
if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::bfyx)
return true;
if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::byxf_af32 && next.as<convolution>().get_groups() == 1)
return true;
if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv4 && is_input_idx(0))
return true;
@ -232,7 +223,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
if (next.is_type<convolution>() &&
(fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bfyx) && prev_output_layout.size.feature[0] == 3 &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32 ||
(fmt_next == format::b_fs_yx_fsv4 ||
fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;
@ -727,23 +718,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
// nothing to do, just go out from here.
} else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim) ||
(_output_size_handling_enabled && prim->with_output_size) || node.get_transposed()) {
// commented out due to performance reasons; may be enabled in the future
/*if (current_layout.data_type == data_types::f32 &&
current_layout.size.batch[0] % 16 == 0 &&
current_layout.format == format::bfyx &&
output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 &&
prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 &&
prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 &&
!node.get_transposed())
{
if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] *
current_layout.size.spatial[1]) == 16 && current_layout.data_padding == padding{ { 0,0,0,0 }, 0 }))
{
expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1);
expected_format = cldnn::format::bf8_xy16;
}
}
else*/
{
expected_tensor = current_layout.size;
if (current_layout.format == format::b_fs_zyx_fsv16 || current_layout.format == format::bs_fs_zyx_bsv16_fsv16)

View File

@ -254,8 +254,7 @@ memory_impl::ptr memory_pool::get_from_padded_pool(const layout& layout,
((layout.format != format::b_fs_yx_fsv32 && layout.format != format::b_fs_zyx_fsv32) ||
(layout.size.feature[0] % 32 == 0)) &&
// TODO: check if this condition is always correct
((layout.format == format::byxf_af32 && layout.size.feature[0] == rec_list._memory->get_layout().size.feature[0]) ||
(layout.format != format::byxf_af32 && layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0])) &&
layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0] &&
layout.size.batch[0] <= rec_list._memory->get_layout().size.batch[0] &&
rec_list._memory->get_layout().format != format::fs_b_yx_fsv32 &&
layout.format != format::fs_b_yx_fsv32 &&

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -142,9 +142,7 @@ std::pair<bool, bool> program_helpers::are_layouts_identical(layout const& l1, l
return {false, false};
if (l1.get_linear_size() != l2.get_linear_size())
return {false, false};
if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) ||
(l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16) ||
(l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
if ((l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
(l2.format == format::b_fs_yx_fsv4 && l1.format != format::b_fs_yx_fsv4) ||
(l1.format == format::fs_b_yx_fsv32 && l2.format != format::fs_b_yx_fsv32) ||
(l2.format == format::fs_b_yx_fsv32 && l1.format != format::fs_b_yx_fsv32) ||

View File

@ -6873,7 +6873,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));
network network(engine, topology, options);
network.set_input_data("input", input);
@ -6893,7 +6893,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, output_size.spatial[1]);
EXPECT_EQ(x_size, output_size.spatial[0]);
EXPECT_EQ(f_size, output_size.feature[0]);
EXPECT_EQ(b_size, output_size.batch[0]);
@ -7945,8 +7945,6 @@ INSTANTIATE_TEST_CASE_P(
.all_test_params(format::bfyx, false, true)
.all_test_params(format::bfyx, true, false)
.all_test_params(format::b_fs_yx_fsv4)
// byxf_af32 - depthwise broken for batch > 1
// .smoke_test_params(format::byxf_af32)
.all_test_params(format::b_fs_yx_fsv32)
.all_test_params(format::b_fs_yx_fsv32, true, true)
.all_test_params(format::b_fs_yx_fsv32, false, true)

View File

@ -1336,7 +1336,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_i8_test::PrintToStringParamName
);
@ -1350,7 +1350,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_u8_test::PrintToStringParamName
);
@ -1364,7 +1364,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_f32_test::PrintToStringParamName
);
@ -1378,7 +1378,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_i8_test::PrintToStringParamName
);
@ -1392,7 +1392,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_u8_test::PrintToStringParamName
);
@ -1406,7 +1406,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_f32_test::PrintToStringParamName
);

View File

@ -117,7 +117,6 @@ TEST(fused_conv_eltwise, basic_image2d)
EXPECT_EQ(outputs_act.begin()->first, "eltwise");
auto output_act = outputs_act.begin()->second.get_memory();
auto&& out_act_layout = output_act.get_layout();
auto out_act_ptr = output_act.pointer<uint8_t>();
topology topology_ref(
@ -140,7 +139,6 @@ TEST(fused_conv_eltwise, basic_image2d)
EXPECT_EQ(outputs_ref.begin()->first, "out");
auto output_ref = outputs_ref.begin()->second.get_memory();
auto&& out_ref_layout = output_ref.get_layout();
auto out_ref_ptr = output_ref.pointer<uint8_t>();
for (int i = 0;i < 3 * 256 * 4;i++) {

View File

@ -1373,58 +1373,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_shift_swish,
bc_test_params{CASE_CONV3D_S8S8_4, 2, 6},
}), );
class conv_int8_byxf_af32 : public ConvFusingTest {};
TEST_P(conv_int8_byxf_af32, per_channel_coeffs) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
scale("scale", "conv_prim", "scale_data"),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::byxf_af32, "" };
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
TEST_P(conv_int8_byxf_af32, per_element_coeffs) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_output_layout(p))),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum),
reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32)
);
implementation_desc conv_impl = { format::byxf_af32, "" };
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
tolerance = 1e-5f;
execute(p);
}
INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_byxf_af32,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_U8S8_4, 2, 3},
bc_test_params{CASE_CONV_U8S8_6, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_4, 2, 3},
bc_test_params{CASE_CONV_S8S8_6, 2, 3},
}), );
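Note: the block removed above was also the only coverage that force-selected the byxf_af32 convolution kernel; the forcing mechanism itself is untouched by this cleanup. A minimal sketch of the same pattern against a surviving layout follows (format::b_fs_yx_fsv4 and the primitive id "conv_prim" are illustrative choices here, not part of this change):
// Sketch, not part of the diff: pin the layout used for one primitive via build
// options, exactly as the removed conv_int8_byxf_af32 tests did. Assumes the
// ConvFusingTest fixture used throughout this file (bo_fused is its build_options member).
implementation_desc conv_impl = { format::b_fs_yx_fsv4, "" };  // empty kernel name: force only the format (assumed semantics)
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));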
class conv_int8_prelu_eltwise : public ConvFusingTest {};
TEST_P(conv_int8_prelu_eltwise, basic) {
auto p = GetParam();
@ -3452,7 +3400,7 @@ struct activation_test_params {
#define CASE_ACTIVATION_F32_2 {7, 3, 7, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_3 {1, 14, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_4 {1, 17, 31, 29}, data_types::f32, format::yxfb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_6 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_7 {1, 17, 31, 29}, data_types::f32, format::fyxb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_3D_F32_0 {3, 16, 13, 13, 13}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
@ -3461,14 +3409,13 @@ struct activation_test_params {
#define CASE_ACTIVATION_3D_F32_3 {1, 17, 7, 7, 7}, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_4 {1, 17, 7, 7, 7}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_5 {1, 17, 7, 7, 7}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_6 {1, 17, 7, 7, 7}, data_types::f32, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_F16_0 {7, 32, 5, 5}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_2 {7, 16, 7, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_3 {1, 14, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_4 {1, 17, 31, 29}, data_types::f16, format::yxfb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_6 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_7 {1, 17, 31, 29}, data_types::f16, format::fyxb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_3D_F16_0 {3, 16, 13, 13, 13}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
@ -3477,7 +3424,6 @@ struct activation_test_params {
#define CASE_ACTIVATION_3D_F16_3 {1, 17, 7, 7, 7}, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_4 {1, 17, 7, 7, 7}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_5 {1, 17, 7, 7, 7}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_6 {1, 17, 7, 7, 7}, data_types::f16, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_U8_2 {1, 12, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
@ -3572,7 +3518,6 @@ INSTANTIATE_TEST_CASE_P(
activation_test_params{CASE_ACTIVATION_F32_7, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 3, "activation_ref"}, // FIXME - accuracy bug
}), );
class activation_scale_activation_quantize_u8 : public ActivationFusingTest {};
@ -3640,7 +3585,6 @@ INSTANTIATE_TEST_CASE_P(
activation_scale_activation_quantize_u8,
::testing::ValuesIn(std::vector<activation_test_params>{
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 5, "activation_ref"}, // FIXME - accuracy bug
}), );
class activation_scale_activation : public ActivationFusingTest {};
@ -3697,8 +3641,7 @@ INSTANTIATE_TEST_CASE_P(
activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_5, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_6, 2, 4, "activation_ref"},
// InputDataType = UINT8
activation_test_params{CASE_ACTIVATION_U8_1, 2, 4, "activation_ref"},
@ -3719,8 +3662,6 @@ INSTANTIATE_TEST_CASE_P(
::testing::ValuesIn(std::vector<activation_test_params>{
activation_test_params{CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F16_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug
}), );
/* ----------------------------------------------------------------------------------------------------- */
@ -4450,30 +4391,21 @@ struct pooling_test_params {
#define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_POOLING_U8_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
#define CASE_POOLING_U8_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_FP16_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
#define CASE_POOLING_U8_FP16_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f16, format::bfyx
#define CASE_POOLING_U8_FP16_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
#define CASE_POOLING_U8_FP16_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
#define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_POOLING_I8_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_POOLING_I8_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
#define CASE_POOLING_I8_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_I8_FP16_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f16, format::bfyx
#define CASE_POOLING_I8_FP16_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
#define CASE_POOLING_I8_FP16_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
// Disabled
#define CASE_POOLING_I8_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx
class PoolingFusingTest : public ::BaseFusingTest<pooling_test_params> {
public:
void execute(pooling_test_params& p) {
@ -4674,8 +4606,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
// Input type: INT8
pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4688,8 +4618,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
}), );
@ -4697,9 +4625,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
pooling_scale_activation_quantize,
::testing::ValuesIn(std::vector<pooling_test_params>{
pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt"}, // currently not enabled, fusing not supported
}), );
@ -4742,8 +4667,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
// Input type: INT8
pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4754,8 +4677,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4822,8 +4743,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
// Input type: INT8
pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4834,24 +4753,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
}), );
INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
pooling_scale_activation,
::testing::ValuesIn(std::vector<pooling_test_params>{
pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
}), );
/* ----------------------------------------------------------------------------------------------------- */
/* -------------------------------- DepthToSpace cases ------------------------------------------------- */
/* ----------------------------------------------------------------------------------------------------- */


@ -2619,7 +2619,6 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::yxfb,
format::bfyx,
format::byxf_af32,
format::b_fs_yx_fsv4,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32)),
@ -2737,8 +2736,7 @@ INSTANTIATE_TEST_CASE_P(
format::b_fs_yx_fsv16,
format::fs_b_yx_fsv32,
format::b_fs_yx_fsv32,
format::b_fs_yx_fsv4,
format::fs_bs_yx_bsv4_fsv32)),
format::b_fs_yx_fsv4)),
testing::internal::DefaultParamName<pooling_random_test_params>);
TEST(pooling_forward_gpu, bsv16_fsv16_max_16x16x8x8_input_2x2_pool_2x2_stride)


@ -766,8 +766,6 @@ INSTANTIATE_TEST_CASE_P(smoke,
resample_random_test,
testing::ValuesIn(
resample_random_test_param_generator()
.smoke_params(data_types::i8, format::byxf_af32, format::byxf_af32)
.smoke_params(data_types::u8, format::byxf_af32, format::byxf_af32)
.smoke_params(data_types::i8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
.smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
.smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)


@ -105,16 +105,4 @@ TEST(tensor_api, linear_offsets) {
test_tensor_offset({ 2, 19, 4, 3 }, { 1, 18, 3, 2 }, cldnn::format::b_fs_yx_fsv16, 754);
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::fs_b_yx_fsv32, 675);
test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::fs_b_yx_fsv32, 1507);
// Formats with alignment:
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byxf_af32, 675);
test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byxf_af32, 1507);
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byx8_f4, 331);
test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byx8_f4, 1755);
// Non-standard blocked formats:
// bf8_xy16 - b_fs_es_fsv8_esv16, where e is flattened yx := x + y * size_x
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::bf8_xy16, 185);
test_tensor_offset({ 2, 19, 7, 3 }, { 1, 18, 3, 2 }, cldnn::format::bf8_xy16, 1441);
}
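For reference, the offsets the removed assertions expected follow directly from the two layouts' blocking rules. The standalone sketch below recomputes them; the helper names are illustrative, the bf8_xy16 blocking order (esv16 innermost under fsv8) is inferred from the comment above, and byx8_f4 is omitted because its exact blocking is not spelled out here.
// Standalone sketch: recompute the linear offsets the removed assertions expected.
#include <cassert>
#include <cstdint>

static int64_t align_to(int64_t v, int64_t a) { return (v + a - 1) / a * a; }

// byxf_af32: plain b, y, x, f order with the feature dimension padded to 32.
static int64_t offset_byxf_af32(int64_t b, int64_t f, int64_t x, int64_t y,
                                int64_t F, int64_t X, int64_t Y) {
    return ((b * Y + y) * X + x) * align_to(F, 32) + f;
}

// bf8_xy16 (b_fs_es_fsv8_esv16): e = x + y * X is the flattened spatial index,
// stored as 16-wide e blocks (esv) inside 8-wide feature blocks (fsv).
static int64_t offset_bf8_xy16(int64_t b, int64_t f, int64_t x, int64_t y,
                               int64_t F, int64_t X, int64_t Y) {
    const int64_t e  = x + y * X;
    const int64_t Es = align_to(X * Y, 16) / 16;  // count of 16-element e blocks
    const int64_t Fs = align_to(F, 8) / 8;        // count of 8-element f blocks
    return (((b * Fs + f / 8) * Es + e / 16) * 8 + f % 8) * 16 + e % 16;
}

int main() {
    // Sizes and coordinates are {b, f, x, y}, as in the removed test_tensor_offset calls.
    assert(offset_byxf_af32(1, 3, 1, 2, 5, 4, 3) == 675);
    assert(offset_byxf_af32(1, 35, 3, 2, 37, 4, 3) == 1507);
    assert(offset_bf8_xy16(1, 3, 1, 2, 5, 4, 3) == 185);
    assert(offset_bf8_xy16(1, 18, 3, 2, 19, 7, 3) == 1441);
    return 0;
}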