[IE CLDNN] Cleanup part 2 (#1865)

* [IE CLDNN] Removed some unused kernels and layouts
* [IE CLDNN] Removed bsv4_fsv32 layout
* [IE CLDNN] Removed remaining BF8_XY16 usages. Removed definitions.cl
parent 711d208eb8
commit 34ba00174e
inference-engine/thirdparty/clDNN/api/layout.hpp (vendored, 18 changes)
@@ -344,14 +344,6 @@ struct layout {
    tensor get_pitches() const {
        auto sizes = get_buffer_size().sizes(format);

        if (format == format::byxf_af32) {
            sizes[3] = align_to(sizes[3], 32);
        }

        if (format == format::byx8_f4) {
            sizes[3] = align_to(sizes[3], 4);
            sizes[2] = align_to(sizes[2], 8);
        }
        std::vector<tensor::value_type> pitches(sizes.size(), tensor::value_type(1));
        std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1, std::multiplies<tensor::value_type>());
        return {format, pitches};
@@ -394,15 +386,7 @@ struct layout {
            sizes[block_axis] = align_to(sizes[block_axis], block_size);
        }

        if (this->format == cldnn::format::bf8_xy16 && !(is_aligned_to(sizes[1], 8) && is_aligned_to(sizes[2] * sizes[3], 16))) {
            sizes[3] = align_to(sizes[2] * sizes[3], 16);
            sizes[2] = 1;
        } else if (this->format == cldnn::format::byxf_af32 && !(is_aligned_to(sizes[1], 32))) {
            sizes[1] = align_to(sizes[1], 32);
        } else if (this->format == cldnn::format::byx8_f4 && (!is_aligned_to(sizes[1], 4) || !is_aligned_to(sizes[2], 8))) {
            sizes[1] = align_to(sizes[1], 4);
            sizes[2] = align_to(sizes[2], 8);
        } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
        if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
            sizes[0] = align_to(sizes[0], 8);
            sizes[1] = align_to(sizes[1], 32);
        } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) {
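Note: the surviving tail of get_pitches() computes each pitch as a running product of the sizes inward of it. A minimal standalone sketch of that std::partial_sum idiom, with illustrative sizes (not clDNN code):

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // sizes in bfyx order, e.g. b=2, f=3, y=4, x=5
    std::vector<int64_t> sizes = {2, 3, 4, 5};
    std::vector<int64_t> pitches(sizes.size(), 1);
    // x pitch stays 1; every outer pitch is the product of all inner sizes,
    // accumulated right-to-left via partial_sum over reverse iterators
    std::partial_sum(sizes.rbegin(), sizes.rend() - 1, pitches.rbegin() + 1,
                     std::multiplies<int64_t>());
    for (auto p : pitches) std::cout << p << ' ';  // prints: 60 20 5 1
    std::cout << '\n';
}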
inference-engine/thirdparty/clDNN/api/tensor.hpp (vendored, 27 changes)
@@ -105,7 +105,6 @@ struct format {
        bs_fs_zyx_bsv16_fsv16,   ///< format used for 3D blocked convolution (batch and features blocked by 16)
        bs_fs_yx_bsv16_fsv16,    ///< format used for 2D blocked convolution (batch and features blocked by 16)
        fs_b_yx_fsv32,           ///< format for input for fp16 primitives
        fs_bs_yx_bsv4_fsv32,     ///< format for batched input for primitives using MMAD
        b_fs_yx_fsv4,            ///< format for input for IMAD convolutions
        bs_xs_xsv8_bsv8,         ///< format used only for fully connected weights: bs - batch slice,
                                 ///< xs - x slice, bsv8 - 8 values of single slice.
@@ -114,10 +113,6 @@ struct format {
        bs_x_bsv16,              ///< format used only for fully connected weights fp16 batch=1 : bs - batch slice
                                 ///< (responses slice), bsv16 - 16 values of single batch slice, x - flattened plane of (fyx)
                                 ///< \n \image html bs_x_bsv16.jpg
        byxf_af32,               ///< format for input for primitives using MMAD
        byx8_f4,                 ///< format for input for MMAD convolutions
        bf8_xy16,                ///< format used only for convolution 1x1 input, xy aligned to 16, f aligned to 8
                                 ///< \n \image html bf8_xy16.jpg
        b_fs_yx_32fp,            ///< format for data for binary convolutions
        winograd_2x3_s1_data,    ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
        nv12,                    ///< format for media nv12 input
@@ -227,11 +222,7 @@ struct format {
        { bs_xs_xsv8_bsv8, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 8}}}},
        { bs_xs_xsv8_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{2, 8}, {0, 16}}}},
        { bs_x_bsv16, { 1, 1, 1, 0, 0, "bx", "b?x??", {{0, 16}}}},
        { bf8_xy16, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 8}}}},
        { winograd_2x3_s1_data, { 1, 1, 2, 0, 0, "bxyf", "bfxy?", {}}},
        { byxf_af32, { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
        { byx8_f4 , { 1, 1, 2, 0, 0, "byxf", "bfxy?", {}}},
        { fs_bs_yx_bsv4_fsv32, { 1, 1, 2, 0, 0, "fbyx", "bfxy?", {{0, 4}, {1, 32}}}},
        { b_fs_yx_fsv4, { 1, 1, 2, 0, 0, "bfyx", "bfxy?", {{1, 4}}}},
        { bfzyx, { 1, 1, 3, 0, 0, "bfzyx", "bfxyz", {}}},
        { bfwzyx, { 1, 1, 4, 0, 0, "bfwzyx", "bfxyzw", {}}},
@@ -943,23 +934,7 @@ public:
            adjusted_coords[external_axis] /= block_size;
        }

        if (fmt == cldnn::format::byxf_af32 && !(is_aligned_to(my_sizes[3], 32))) {
            my_sizes[3] = align_to(my_sizes[3], 32);
        } else if (fmt == cldnn::format::byx8_f4 && (!(is_aligned_to(my_sizes[3], 4)) || !(is_aligned_to(my_sizes[2], 8)))) {
            my_sizes[3] = align_to(my_sizes[3], 4);
            my_sizes[2] = align_to(my_sizes[2], 8);
        } else if (fmt == cldnn::format::bf8_xy16) {
            // Special case of blocked format, where xy is treated as one flattened dimension
            auto flat_xy = adjusted_coords[3] + adjusted_coords[2] * my_sizes[3];

            my_sizes.push_back(16);
            my_sizes[3] = ceil_div(my_sizes[2] * my_sizes[3], 16);
            my_sizes[2] = 1;

            adjusted_coords.push_back(flat_xy % 16);
            adjusted_coords[3] = flat_xy / 16;
            adjusted_coords[2] = 0;
        } else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets calculation for formats below
        if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4 && // TODO Fix offsets calculation for formats below
            !(is_aligned_to(my_sizes[0], 8)) &&
            !(is_aligned_to(my_sizes[1], 32))) {
            my_sizes[0] = align_to(my_sizes[0], 8);
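Note: the removed bf8_xy16 branch above remaps (y, x) into one flattened index that is then split into blocks of 16. An illustrative numeric sketch of that remapping (variable names are local to this example, not clDNN code):

#include <cstddef>
#include <iostream>

int main() {
    std::size_t size_x = 10;               // my_sizes[3] before remapping
    std::size_t y = 3, x = 7;              // adjusted_coords[2] and adjusted_coords[3]
    std::size_t flat_xy = x + y * size_x;  // 37: index in the flattened xy plane
    std::cout << flat_xy / 16 << ' '       // 2: which 16-wide block the element lands in
              << flat_xy % 16 << '\n';     // 5: offset inside that block
}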
@@ -42,11 +42,7 @@ DataTensor::DataChannelArray DataTensor::dataChannelArray {{
    { DataLayout::bs_fs_yx_bsv16_fsv16, { 0, 1, -1, -1, 2, 3 } },
    { DataLayout::bs_f_bsv8__af8, { -1, -1, -1, -1, 0, 1 } },
    { DataLayout::bs_f_bsv16__af8, { -1, -1, -1, -1, 0, 1 } },
    { DataLayout::bf8_xy16, { 0, 1, -1, -1, 2, 3 } },
    { DataLayout::winograd_2x3_s1_data, { 2, 1, -1, -1, 0, 3 } },
    { DataLayout::byxf_af32, { 1, 2, -1, -1, 0, 3 } },
    { DataLayout::byx8_f4, { 1, 2, -1, -1, 0, 3 } },
    { DataLayout::fs_bs_yx_bsv4_fsv32, { 0, 1, -1, -1, 3, 2 } },
    { DataLayout::b_fs_yx_fsv4, { 0, 1, -1, -1, 2, 3 } },
    { DataLayout::bfzyx, { 0, 1, 2, -1, 3, 4 } },
    { DataLayout::fs_b_yx_fsv32, { 0, 1, -1, -1, 3, 2 } },
@@ -167,26 +163,6 @@ NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
            assert(newDims.size() == 5);
            newDims[3] = RoundUp(newDims[3], 32);
            break;
        case bf8_xy16:
            assert(newDims.size() == 4);
            newDims[1] = RoundUp(newDims[1], 8);
            newDims[3] = RoundUp(newDims[2] * newDims[3], 16);
            newDims[2] = 1;
            break;
        case byxf_af32:
            assert(newDims.size() == 4);
            newDims[0] = RoundUp(newDims[0], 32);
            break;
        case byx8_f4:
            assert(newDims.size() == 4);
            newDims[0] = RoundUp(newDims[0], 4);
            newDims[1] = RoundUp(newDims[1], 8);
            break;
        case fs_bs_yx_bsv4_fsv32:
            assert(newDims.size() == 4);
            newDims[3] = RoundUp(newDims[3], 32);
            newDims[2] = RoundUp(newDims[2], 4);
            break;
        case b_fs_yx_32fp:
            assert(newDims.size() == 4);
            newDims[3] = RoundUp(newDims[3], 32);
@@ -222,14 +198,6 @@ NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
        pitch *= newDims[i];
    }

    if (l == byxf_af32 || l == fs_bs_yx_bsv4_fsv32 || l == byx8_f4) {
        ret[0].pitch = 1;
        ret[1].pitch = ret[0].pitch * newDims[0];
        ret[2].pitch = ret[1].pitch * newDims[1];
        ret[3].pitch = ret[2].pitch * newDims[2];
        ret[4].pitch = ret[3].pitch * newDims[3];
    }

    return ret;
}
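Note: for the layouts removed above, GetSimpleDims first padded the blocked dimension and then took cumulative products as pitches. A rough sketch for byxf_af32, where features sit at index 0 of the storage order and are rounded up to 32 (RoundUp here is a local stand-in for the kernel_selector helper; the sizes are invented):

#include <cstddef>
#include <iostream>
#include <vector>

static std::size_t RoundUp(std::size_t v, std::size_t m) { return (v + m - 1) / m * m; }

int main() {
    // dims in storage order {f, x, y, b}, e.g. f=20, x=7, y=5, b=2
    std::vector<std::size_t> dims = {20, 7, 5, 2};
    dims[0] = RoundUp(dims[0], 32);  // af32: features padded 20 -> 32
    std::vector<std::size_t> pitch(dims.size());
    std::size_t p = 1;
    for (std::size_t i = 0; i < dims.size(); ++i) { pitch[i] = p; p *= dims[i]; }
    for (auto v : pitch) std::cout << v << ' ';  // prints: 1 32 224 1120
    std::cout << '\n';
}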
@@ -51,11 +51,7 @@ enum DataLayout {
    bs_fs_zyx_bsv16_fsv16,   // batch, feature, 3D spatial. Blocks of 16 batch and channels
    bs_f_bsv8__af8,          // for optimized FC
    bs_f_bsv16__af8,         // for optimized FC
    bf8_xy16,                // for optimized conv1x1
    winograd_2x3_s1_data,    // winograd convolution input, F(2,3) -- filter 3x3 with stride 1
    byxf_af32,               // for MMAD convolution
    byx8_f4,                 // for MMAD convolution
    fs_bs_yx_bsv4_fsv32,     // for batched MMAD
    b_fs_yx_fsv4,            // reordering format for swizzled input for convolution using IMAD
    bfzyx,                   // batch+feature+3D spatial
    fs_b_yx_fsv32,           // for FP16 kernels, 32 features to avoid partial writes
@@ -41,7 +41,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::byxf);
    k.EnableInputLayout(DataLayout::fyxb);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableOutputLayout(DataLayout::bf);
@@ -51,7 +50,6 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::byxf);
    k.EnableOutputLayout(DataLayout::fyxb);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableTensorOffset();
@@ -25,11 +25,9 @@ ParamsKey ConvolutionKernel_bfyx_1x1::GetSupportedKey() const {
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputWeightsType(WeightsType::F16);
    k.EnableInputWeightsType(WeightsType::F32);
    k.EnableInputLayout(DataLayout::bf8_xy16);
    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::yxfb);
    k.EnableOutputLayout(DataLayout::bf8_xy16);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
@@ -82,7 +82,6 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);

    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
@@ -116,8 +115,6 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p
        in_fsv = 4;
    else if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16)
        in_fsv = 16;
    else if (params.inputs[0].GetLayout() == DataLayout::byxf_af32)
        in_fsv = 32;

    mem_consts.AddConstants({
        MakeJitConstant("_ID", RoundUp(input.Feature().v, in_fsv)),
@@ -1,138 +0,0 @@
/*
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_imad_byxf_af32_1x1.h"

static size_t GetTileLength(size_t out_xy, size_t out_f, size_t min_threads) {
    for (int tile_len = 14; tile_len > 0; tile_len--) {
        // Kernel writes 32 output features per HW thread
        size_t threads = (out_xy / tile_len) * out_xy * out_f / 32;
        // Choose largest valid tile with enough HW threads
        if ((out_xy % tile_len == 0) && (threads >= min_threads)) {
            return tile_len;
        }
    }
    return 1;
}

namespace kernel_selector {

ParamsKey ConvolutionKernel_imad_byxf_af32_1x1::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
    k.EnableBiasPerFeature();
    k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

bool ConvolutionKernel_imad_byxf_af32_1x1::Validate(const Params& p, const optional_params& o) const {
    if (!Parent::Validate(p, o)) {
        return false;
    }

    const auto& params = static_cast<const convolution_params&>(p);

    if (params.filterSize.x != 1 || params.filterSize.y != 1)
        return false;

    if (params.padding.x != 0 || params.padding.y != 0)
        return false;

    if (params.output.Feature().v % 32 != 0)
        return false;

    const auto& input = params.inputs[0];

    // we do not support padded input
    if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0)
        return false;

    if (params.split != 1)
        return false;

    if (params.groups != 1)
        return false;

    return true;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_1x1::SetDefault(const convolution_params& arg,
                                                                                     int) const {
    DispatchData runInfo = Parent::SetDefault(arg);

    // Sub-group size
    constexpr size_t sub_group_size = 8;

    const auto of_maps = arg.output.Feature().v;
    const size_t of_maps_per_batch = RoundUp(of_maps, 32);
    const size_t of_maps_total = of_maps_per_batch * arg.output.Batch().v;

    // Need to have at least 4 HW threads per EU
    const size_t tile_length = GetTileLength(arg.output.X().v, of_maps_total, arg.engineInfo.computeUnitsCount * 4);
    runInfo.cldnnStyle.blockWidth = tile_length;

    runInfo.efficiency = FORCE_PRIORITY_1;

    runInfo.gws0 = arg.output.X().v * arg.output.Y().v / tile_length;
    runInfo.gws1 = of_maps_total / 4;  // TILE_DEPTH==4
    runInfo.gws2 = 1;

    runInfo.lws0 = 1;
    runInfo.lws1 = sub_group_size;
    runInfo.lws2 = 1;

    return runInfo;
}

JitConstants ConvolutionKernel_imad_byxf_af32_1x1::GetJitConstants(const convolution_params& params,
                                                                   const DispatchData& runInfo) const {
    auto jit = Parent::GetJitConstants(params, runInfo);

    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1));
    jit.AddConstant(MakeJitConstant("TILE_LENGTH", runInfo.cldnnStyle.blockWidth));
    jit.AddConstant(MakeJitConstant("TILE_DEPTH", 4));

    jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));

    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"", {"b", "f2", "y", "(x+i)"}, "res", input_dt, 1 };
        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }

    return jit;
}

KernelsData ConvolutionKernel_imad_byxf_af32_1x1::GetKernelsData(const Params& params,
                                                                 const optional_params& options) const {
    return GetTunedKernelsDataByIndex(params, options);
}
}  // namespace kernel_selector
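Note: the deleted kernel's GetTileLength walks tile lengths from 14 down and keeps the first one that divides the output width and still yields enough hardware threads. A quick numeric check of that heuristic (logic copied from the file above; the sample call values are invented):

#include <cstddef>
#include <iostream>

static std::size_t GetTileLength(std::size_t out_xy, std::size_t out_f, std::size_t min_threads) {
    for (int tile_len = 14; tile_len > 0; tile_len--) {
        // Kernel writes 32 output features per HW thread
        std::size_t threads = (out_xy / tile_len) * out_xy * out_f / 32;
        if ((out_xy % tile_len == 0) && (threads >= min_threads)) {
            return tile_len;
        }
    }
    return 1;
}

int main() {
    // 56-wide output, 256 total feature maps, 24 EUs * 4 threads each = 96 minimum
    std::cout << GetTileLength(56, 256, 96) << '\n';  // 14: (56/14)*56*256/32 = 1792 >= 96
}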
@@ -1,45 +0,0 @@
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_imad_byxf_af32_1x1 : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_imad_byxf_af32_1x1() : ConvolutionKernelBase("fused_conv_eltwise_gpu_af32_imad_1x1") {}
    virtual ~ConvolutionKernel_imad_byxf_af32_1x1() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    bool Validate(const Params& p, const optional_params& o) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::os_is_osv32_isv32_swizzled_by_4;
    }
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
}  // namespace kernel_selector
@@ -1,143 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_imad_byxf_af32_depthwise.h"

#define SIMD_SIZE 16

namespace kernel_selector {

ParamsKey ConvolutionKernel_imad_byxf_af32_depthiwise::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
    k.EnableBiasPerFeature();
    k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableSplitSupport();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.EnableDepthwiseSeparableOpt();
    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.DisableTuning();
    k.EnableGroupedConvolution();
    return k;
}

static size_t GetTileLength(size_t out_x) {
    for (int i = 20; i >= 1; i--) {
        if (out_x % i == 0)
            return i;
    }
    return 1;
}

static int GetSplit(size_t out_x, int stride) {
    if (out_x >= 75) {
        if (stride > 1)
            return 1;
        else
            return 3;
    }

    if (out_x == 38 && stride == 2)
        return 2;

    if (out_x < 75) {
        if (stride > 1)
            return 1;
        else if (out_x % 2 == 0)
            return 2;
    }
    return 1;
}

bool ConvolutionKernel_imad_byxf_af32_depthiwise::Validate(const Params& p, const optional_params& o) const {
    if (!Parent::Validate(p, o)) {
        return false;
    }

    const convolution_params& cp = static_cast<const convolution_params&>(p);
    if (cp.inputs[0].Feature().v != cp.groups || cp.output.Feature().v != cp.groups || cp.groups == 1) {
        return false;
    }

    return true;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_byxf_af32_depthiwise::SetDefault(const convolution_params& arg,
                                                                                            int) const {
    DispatchData runInfo = Parent::SetDefault(arg);

    runInfo.efficiency = FORCE_PRIORITY_1;

    runInfo.gws0 = Align(arg.output.Feature().v, SIMD_SIZE) * arg.output.Batch().v;
    runInfo.gws1 = arg.output.X().v / GetTileLength(arg.output.X().v);
    runInfo.gws2 = CeilDiv(arg.output.Y().v, GetSplit(arg.output.Y().v, arg.stride.y));

    std::vector<size_t> local = { SIMD_SIZE, 1, 1 };

    runInfo.lws0 = local[0];
    runInfo.lws1 = local[1];
    runInfo.lws2 = local[2];

    return runInfo;
}

JitConstants ConvolutionKernel_imad_byxf_af32_depthiwise::GetJitConstants(const convolution_params& params,
                                                                          const DispatchData& runInfo) const {
    auto jit = Parent::GetJitConstants(params, runInfo);

    jit.AddConstant(MakeJitConstant("ALIGNED_OFM", Align(params.output.Feature().v, SIMD_SIZE)));
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", GetTileLength(params.output.X().v)));
    jit.AddConstant(MakeJitConstant("SPLIT_Y", GetSplit(params.output.Y().v, params.stride.y)));
    jit.AddConstant(MakeJitConstant("SIMD_SIZE", SIMD_SIZE));

    if (params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y) != 0)
        jit.AddConstant(MakeJitConstant("SPLIT_LEFTOVERS", params.output.Y().v % GetSplit(params.output.Y().v, params.stride.y)));

    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"", {"b", "of", "(y+m)", "(x+l)"}, "res", input_dt, 1 };
        conf_scalar.SetLoopAxes({Tensor::DataChannelName::Y, Tensor::DataChannelName::X});
        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }

    return jit;
}

KernelsData ConvolutionKernel_imad_byxf_af32_depthiwise::GetKernelsData(const Params& params,
                                                                        const optional_params& options) const {
    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
    if (!kd.empty())
        kd[0].estimatedTime = FORCE_PRIORITY_1;
    return kd;
}

}  // namespace kernel_selector
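Note: GetSplit in the deleted depthwise kernel decides how many output rows each work-item covers. Sample evaluations of the heuristic (condensed from the code above; the probe values are arbitrary):

#include <cstddef>
#include <iostream>

static int GetSplit(std::size_t out_x, int stride) {
    if (out_x >= 75)
        return (stride > 1) ? 1 : 3;      // large output: split 3 ways at stride 1
    if (out_x == 38 && stride == 2)
        return 2;                         // dedicated special case
    if (out_x < 75) {
        if (stride > 1)
            return 1;
        else if (out_x % 2 == 0)
            return 2;                     // small, even output
    }
    return 1;
}

int main() {
    std::cout << GetSplit(112, 1) << ' '   // 3
              << GetSplit(38, 2) << ' '    // 2
              << GetSplit(28, 1) << ' '    // 2
              << GetSplit(27, 1) << '\n';  // 1
}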
@@ -1,46 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_imad_byxf_af32_depthiwise : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_imad_byxf_af32_depthiwise() : ConvolutionKernelBase("convolution_gpu_byxf_af32_depthwise") {}
    virtual ~ConvolutionKernel_imad_byxf_af32_depthiwise() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
    bool Validate(const Params& p, const optional_params& o) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::goiyx;
    }
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
}  // namespace kernel_selector
@@ -1,93 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_mmad.h"

namespace kernel_selector {

ParamsKey ConvolutionKernel_mmad::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
    k.EnableBiasPerFeature();
    k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableSplitSupport();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.EnableDifferentInputWeightsTypes();
    k.DisableTuning();
    k.EnableDifferentTypes();
    return k;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad::SetDefault(const convolution_params& arg, int) const {
    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);

    constexpr size_t sub_group_size = 8;

    const auto of_maps = arg.output.Feature().v;
    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);

    runInfo.efficiency = FORCE_PRIORITY_4;

    runInfo.gws0 = arg.output.X().v;
    runInfo.gws1 = arg.output.Y().v;
    runInfo.gws2 = of_threads_per_batch * arg.output.Batch().v;

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = sub_group_size;

    return runInfo;
}

JitConstants ConvolutionKernel_mmad::GetJitConstants(const convolution_params& params,
                                                     const DispatchData& runInfo) const {
    auto jit = Parent::GetJitConstants(params, runInfo);

    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));

    // pitch for special block format used in this kernel
    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));

    jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"", {"b", "f", "y", "x"}, "res", input_dt, 1 };
        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }
    return jit;
}

KernelsData ConvolutionKernel_mmad::GetKernelsData(const Params& params, const optional_params& options) const {
    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
    if (!kd.empty())
        kd[0].estimatedTime = FORCE_PRIORITY_4;
    return kd;
}
}  // namespace kernel_selector
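Note: FILTER_OFM_BLOCK_PITCH above encodes the os_is_yx_isa8_osv8_isv4 weight layout, where each block holds 8 input slices * 8 outputs * 4 inputs = 256 values per filter position. A worked example of the arithmetic (Align is a local stand-in for the kernel_selector helper; the sizes are invented):

#include <cstddef>
#include <iostream>

static std::size_t Align(std::size_t v, std::size_t m) { return (v + m - 1) / m * m; }

int main() {
    std::size_t ifm = 20, filter_x = 3, filter_y = 3;
    const std::size_t ifm_32_aligned = Align(ifm, 32);  // 20 -> 32
    const std::size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * filter_x * filter_y * 4 * 8 * 8;  // 1 * 9 * 256
    std::cout << filter_ofm_block_pitch << '\n';  // 2304 values between OFM blocks
}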
@@ -1,45 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_mmad : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_mmad() : ConvolutionKernelBase("convolution_gpu_mmad") {}
    virtual ~ConvolutionKernel_mmad() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
    }
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
}  // namespace kernel_selector
@@ -1,95 +0,0 @@
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_mmad_batched.h"

namespace kernel_selector {

ParamsKey ConvolutionKernel_mmad_batched::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
    k.EnableBiasPerFeature();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableSplitSupport();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched::SetDefault(const convolution_params& arg,
                                                                               int) const {
    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);

    constexpr size_t sub_group_size = 8;

    const auto of_maps = arg.output.Feature().v;
    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);

    runInfo.efficiency = FORCE_PRIORITY_6;

    runInfo.gws0 = arg.output.X().v;
    runInfo.gws1 = arg.output.Y().v;
    runInfo.gws2 = of_threads_per_batch * ((arg.output.Batch().v + 3) / 4);

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = sub_group_size;

    return runInfo;
}

JitConstants ConvolutionKernel_mmad_batched::GetJitConstants(const convolution_params& params,
                                                             const DispatchData& runInfo) const {
    auto jit = Parent::GetJitConstants(params, runInfo);

    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));

    // pitch for special block format used in this kernel
    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));

    const size_t in_x_pitch = 32 * 4;
    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
    const size_t in_offset =
        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;

    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
    return jit;
}

KernelsData ConvolutionKernel_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const {
    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
    if (!kd.empty())
        kd[0].estimatedTime = FORCE_PRIORITY_6;
    return kd;
}
}  // namespace kernel_selector
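Note: the IN_*_PITCH constants above follow directly from the fs_bs_yx_bsv4_fsv32 packing, where one x step spans 4 batches * 32 features = 128 values. A worked example with invented padded sizes:

#include <cstddef>
#include <iostream>

int main() {
    std::size_t x_padded = 14, y_padded = 14, batch = 8;
    const std::size_t in_x_pitch = 32 * 4;                                      // 128
    const std::size_t in_y_pitch = in_x_pitch * x_padded;                       // 1792
    const std::size_t in_b_block_pitch = in_y_pitch * y_padded;                 // 25088
    const std::size_t in_f_block_pitch = in_b_block_pitch * ((batch + 3) / 4);  // 50176
    std::cout << in_x_pitch << ' ' << in_y_pitch << ' '
              << in_b_block_pitch << ' ' << in_f_block_pitch << '\n';
}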
@@ -1,39 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_mmad_batched : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_mmad_batched() : ConvolutionKernelBase("convolution_gpu_mmad_batched") {}
    virtual ~ConvolutionKernel_mmad_batched() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
    }
};
}  // namespace kernel_selector
@@ -1,165 +0,0 @@
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_mmad_batched_block.h"
#include "kernel_selector_utils.h"
#include <vector>

namespace kernel_selector {

ParamsKey ConvolutionKernel_mmad_batched_block::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

struct block_params {
    int32_t out_width;
    int32_t out_height;
    int32_t out_depth;
};

static block_params get_out_block_size(const convolution_params& p) {
    if (p.filterSize.x == 3 && p.filterSize.y == 3) {
        if (p.output.X().v == 7)
            return {7, 1, 4};
        else if (p.output.X().v == 14)
            return {7, 1, 4};
        else if (p.output.X().v == 28)
            return {7, 1, 4};
        else if (p.output.X().v == 56)
            return {8, 1, 4};
    }

    return {1, 1, 1};
}

WeightsLayout ConvolutionKernel_mmad_batched_block::GetPreferredWeightsLayout(
        const convolution_params &cp) const {
    auto block = get_out_block_size(cp);
    if (block.out_depth == 4)
        return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
    else
        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}

bool ConvolutionKernel_mmad_batched_block::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }
    const convolution_params& cp = static_cast<const convolution_params&>(p);

    // if block sizes are 1x1, then this algorithm is probably not the best
    auto block = get_out_block_size(cp);
    if (block.out_width == 1 && block.out_height == 1)
        return false;

    if (cp.output.X().v % block.out_width != 0)
        return false;
    if (cp.output.Y().v % block.out_height != 0)
        return false;

    if (cp.filterSize.x == 1)
        return false;

    return true;
}

size_t static get_wg_batch_count(const convolution_params& params) {
    if (params.inputs[0].Batch().v % 64 == 0)
        return 16;  // because we process 4 batches per SIMD
    return 1;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block::SetDefault(const convolution_params& arg,
                                                                                     int) const {
    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);

    constexpr size_t sub_group_size = 8;

    runInfo.efficiency = FORCE_PRIORITY_5;

    auto block = get_out_block_size(arg);

    runInfo.gws0 = arg.output.X().v / block.out_width;
    runInfo.gws1 = arg.output.Y().v / block.out_height;
    runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
                   block.out_depth;  // process 4 output channels per Workitem

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);

    return runInfo;
}

JitConstants ConvolutionKernel_mmad_batched_block::GetJitConstants(const convolution_params& params,
                                                                   const DispatchData& runInfo) const {
    auto jit = Parent::GetJitConstants(params, runInfo);

    const int sub_group_size = 8;
    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));

    // pitch for special block format used in this kernel
    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));

    const size_t in_x_pitch = 32 * 4;
    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
    const size_t in_offset =
        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;

    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));

    const size_t out_x_pitch = 32 * 4;
    jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));

    auto block = get_out_block_size(params);
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
    jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));

    jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));

    return jit;
}

KernelsData ConvolutionKernel_mmad_batched_block::GetKernelsData(const Params& params,
                                                                 const optional_params& options) const {
    KernelsData kd = GetCommonKernelsData(params, options);
    if (!kd.empty())
        kd[0].estimatedTime = FORCE_PRIORITY_5;
    return kd;
}
}  // namespace kernel_selector
@@ -1,38 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_mmad_batched_block : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_mmad_batched_block() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block") {}
    virtual ~ConvolutionKernel_mmad_batched_block() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& p, const optional_params& o) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
};
}  // namespace kernel_selector
@@ -1,171 +0,0 @@
/*
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_mmad_batched_block_1x1.h"
#include "kernel_selector_utils.h"
#include <vector>

namespace kernel_selector {

ParamsKey ConvolutionKernel_mmad_batched_block_1x1::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

struct block_params {
    int32_t out_width;
    int32_t out_height;
    int32_t out_depth;
};

static block_params get_out_block_size(const convolution_params& p) {
    if (p.output.X().v == 7)
        return {7, 1, 4};
    else if (p.output.X().v == 14)
        return {7, 1, 4};
    else if (p.output.X().v == 28)
        return {4, 2, 4};
    else if (p.output.X().v == 56)
        return {8, 1, 4};

    return {1, 1, 1};
}

WeightsLayout ConvolutionKernel_mmad_batched_block_1x1::GetPreferredWeightsLayout(
        const convolution_params &cp) const {
    auto block = get_out_block_size(cp);
    if (block.out_depth == 4)
        return WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4;
    else
        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
}

bool ConvolutionKernel_mmad_batched_block_1x1::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }
    const convolution_params& cp = static_cast<const convolution_params&>(p);

    // only for conv 1x1
    if (cp.filterSize.x != 1 || cp.filterSize.y != 1)
        return false;

    // only for stride 1x1
    if (cp.stride.x != 1 || cp.stride.y != 1)
        return false;

    // if block sizes are 1x1, then this algorithm is probably not the best
    auto block = get_out_block_size(cp);
    if (block.out_depth != 4)
        return false;

    if (cp.output.X().v % block.out_width != 0)
        return false;
    if (cp.output.Y().v % block.out_height != 0)
        return false;

    return true;
}

size_t static get_wg_batch_count(const convolution_params& params) {
    if (params.inputs[0].Batch().v % 64 == 0)
        return 16;  // because we process 4 batches per SIMD
    return 1;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_batched_block_1x1::SetDefault(const convolution_params& arg,
                                                                                         int) const {
    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);

    constexpr size_t sub_group_size = 8;

    runInfo.efficiency = FORCE_PRIORITY_3;

    auto block = get_out_block_size(arg);

    runInfo.gws0 = arg.output.X().v / block.out_width;
    runInfo.gws1 = arg.output.Y().v / block.out_height;
    runInfo.gws2 = (arg.output.Feature().v) * ((arg.output.Batch().v + 3) / 4) /
                   block.out_depth;  // process 4 output channels per Workitem

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = sub_group_size * get_wg_batch_count(arg);

    return runInfo;
}

JitConstants ConvolutionKernel_mmad_batched_block_1x1::GetJitConstants(const convolution_params& params,
                                                                       const DispatchData& runInfo) const {
    auto jit = Parent::GetJitConstants(params, runInfo);

    const int sub_group_size = 8;
    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));

    // pitch for special block format used in this kernel
    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));

    const size_t in_x_pitch = 32 * 4;
    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
    const size_t in_offset =
        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;

    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));

    const size_t out_x_pitch = 32 * 4;
    const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();

    jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch));
    jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));

    auto block = get_out_block_size(params);
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", block.out_width));
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", block.out_height));
    jit.AddConstant(MakeJitConstant("WEIGHTS_PER_WORKITEM", block.out_depth));

    jit.AddConstant(MakeJitConstant("WG_BATCH_COUNT", get_wg_batch_count(params)));

    return jit;
}

KernelsData ConvolutionKernel_mmad_batched_block_1x1::GetKernelsData(const Params& params,
                                                                     const optional_params& options) const {
    KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
    if (!kd.empty())
        kd[0].estimatedTime = FORCE_PRIORITY_3;
    return kd;
}
}  // namespace kernel_selector
@@ -1,38 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_mmad_batched_block_1x1 : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_mmad_batched_block_1x1() : ConvolutionKernelBase("convolution_gpu_mmad_batched_block_1x1") {}
    virtual ~ConvolutionKernel_mmad_batched_block_1x1() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& p, const optional_params& o) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override;
};
}  // namespace kernel_selector
@@ -35,7 +35,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetSupportedKey() const {

    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableDilation();
@@ -1,270 +0,0 @@
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "convolution_kernel_mmad_blocks.h"
#include <vector>
#include <utility>
#include <string>
#include <algorithm>

namespace kernel_selector {
ConvolutionKernel_mmad_blocks::ConvolutionKernel_mmad_blocks() : ConvolutionKernelBase("convolution_gpu_mmad_blocks") {
    // Generate the dispatch options for the auto-tuner.
    std::vector<size_t> blockWidthSizes = {1, 2, 4, 5, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32};
    std::vector<size_t> blockHeightSizes = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    std::vector<size_t> prefetchSizes = {1, 2, 3, 4, 5, 6, 8, 10};
    std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
    const size_t maxBlockSize = 240;
    for (auto executionMode : executionModes) {
        for (auto blockWidth : blockWidthSizes) {
            for (auto blockHeight : blockHeightSizes) {
                for (auto prefetch : prefetchSizes) {
                    if (blockWidth * blockHeight <= maxBlockSize) {
                        autoTuneOptions.emplace_back(AutoTuneOption{blockWidth, blockHeight, prefetch, executionMode});
                    }
                }
            }
        }
    }
}

ParamsKey ConvolutionKernel_mmad_blocks::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F16);

    k.EnableInputWeightsType(WeightsType::INT8);

    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    // k.EnableDilation(); TODO: Add dilation support
    k.EnableBiasPerFeature();
    k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableSplitSupport();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.DisableTuning();
    return k;
}

bool ConvolutionKernel_mmad_blocks::Validate(const Params& p, const optional_params& o) const {
    if (!Parent::Validate(p, o)) {
        return false;
    }

    return true;
}

static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) {
    // how many elements we will compute in each dimension
    size_t computed_x = Align(output_x, block_x);
    size_t computed_y = Align(output_y, block_y);
    // how many simds we need in each dimension
    size_t simds_x = computed_x / block_x;
    size_t simds_y = computed_y / block_y;
    // how many unused values we have in each dimension
    size_t unused_x = computed_x - output_x;
    size_t unused_y = computed_y - output_y;

    block_x -= unused_x / simds_x;
    block_y -= unused_y / simds_y;
}

ConvolutionKernel_mmad_blocks::AutoTuneOption ConvolutionKernel_mmad_blocks::GetAutoTuneOptions(
    const Params& p,
    int autoTuneIndex) const {
    if ((autoTuneIndex >= 0) && (autoTuneIndex < static_cast<int>(autoTuneOptions.size()))) {
        return autoTuneOptions[autoTuneIndex];
    }

    // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
    constexpr size_t sub_group_size = 16;

    AutoTuneOption option = {0, 0, 0, DEFAULT};

    const convolution_params& cp = static_cast<const convolution_params&>(p);

    if (cp.stride.x == 1 && cp.stride.y == 1) {
        if (cp.filterSize.x == 1 && cp.filterSize.y == 1) {
            option.blockWidth = 16;
            option.blockHeight = 1;
            option.prefetch = 4;
        // if fewer than 16 values are required to compute one single row of output,
        // then each WI shall compute one single row to maximize reuse within the SIMD subgroup
        // (this gives very nice performance results)
        } else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size) {
            option.blockWidth = cp.output.X().v;
            option.blockHeight = 1;
            option.prefetch = 4;
        } else if (cp.filterSize.x < 5 && cp.filterSize.y < 5) {
            option.blockWidth = sub_group_size - cp.filterSize.x + 1;
            option.blockHeight = 2;
            option.prefetch = 4;
        } else {
            option.blockWidth = 4;
            option.blockHeight = 3;
            option.prefetch = 4;
        }
    } else if (cp.stride.x == 2 && cp.stride.y == 2) {
        option.blockWidth = 5;
        option.blockHeight = 4;
        option.prefetch = 4;
    } else {
        option.blockWidth = 4;
        option.blockHeight = 3;
        option.prefetch = 5;
        // run_info.efficiency = FORCE_PRIORITY_7; // GEMM is better
    }

    // If this is not the 1x1 batch-1 case then shrink the blocks; otherwise we're memory bound
    // and it's best to use 16x1 block sizes.
    if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1) {
        shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v, option.blockWidth, option.blockHeight);
    }

    return option;
}

static std::pair<size_t, size_t> get_byxf_af32_req_input_block_dims(size_t output_block_width,
                                                                    size_t output_block_height,
                                                                    const uSize& filter_size,
                                                                    const uSize& stride,
                                                                    const uSize& dilation,
                                                                    size_t sub_group_size = 8,
                                                                    size_t read_chunk_size = 8,
                                                                    size_t min_read_size = 8) {
    assert(output_block_width > 0 && output_block_height > 0);
    assert(stride.x > 0 && stride.y > 0);
    assert(filter_size.x > 0 && filter_size.y > 0);

    // Number of elements in X dimension needed from input to compute output block without re-reading input.
    size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1;
    // Number of elements in Y dimension needed from input to compute output block without re-reading input.
    size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1;

    // Required number of elements in X dimension rounded to nearest >= read chunk size.
    size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
    // Number of sub-group-sized vectors of unit type needed to store input block.
    size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sub_group_size);

    // size of our array per workitem
    input_block_array_size = input_block_req_height * input_block_read_width;
    return std::make_pair(input_block_array_size, input_block_read_width);
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_blocks::SetDefault(const convolution_params& cp,
                                                                              int autoTuneIndex) const {
    // Sub-group size used by "convolution_gpu_mmad_blocks" kernel.
    constexpr size_t sub_group_size = 8;

    DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);

    auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
    runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
    runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
    runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;

    auto input_block_dims =
        get_byxf_af32_req_input_block_dims(runInfo.cldnnStyle.blockWidth,
                                           runInfo.cldnnStyle.blockHeight,
                                           cp.filterSize,
                                           cp.stride,
                                           cp.dilation,
                                           sub_group_size,
                                           runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
                                           sub_group_size);
    runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
    runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;

    const auto of_maps = cp.output.Feature().v;
    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);

    runInfo.efficiency = FORCE_PRIORITY_3;

    runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
    runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
    runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = sub_group_size;

    return runInfo;
}

JitConstants ConvolutionKernel_mmad_blocks::GetJitConstants(const convolution_params& params,
                                                            const DispatchData& runInfo) const {
    auto jit = Parent::GetJitConstants(params, runInfo);

    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
    jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
    jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
    jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
    jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
    jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));

    jit.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));

    // pitch for special block format used in this kernel
    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));

    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"", {"b", "f", "(y+br)", "(x+bc)"}, "res", input_dt, 1 };
        jit.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }

    return jit;
}

KernelsData ConvolutionKernel_mmad_blocks::GetKernelsData(const Params& params, const optional_params& options) const {
    KernelsData kd = GetTunedKernelsDataByIndex(params, options);
    if (!kd.empty())
        kd[0].estimatedTime = FORCE_PRIORITY_2;

    return kd;
}

KernelsData ConvolutionKernel_mmad_blocks::GetKernelsDataForAutoTune(const Params& params,
                                                                     const optional_params& options) const {
    if (!Validate(params, options)) {
        return {};
    }

    KernelsData res = {};

    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
        if (!kd.empty()) {
            res.emplace_back(kd[0]);
        }
    }

    return res;
}

} // namespace kernel_selector
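For readers tracing the deleted auto-tune logic, here is a minimal standalone sketch of how shrink_blocks_to_output_size trims the tuned block so fewer computed elements fall past the output edge; the Align helper here is an assumption standing in for the kernel_selector utility of the same name:

#include <cstddef>
#include <iostream>

// Assumed behavior of the kernel_selector Align utility: round v up to a multiple of m.
static size_t Align(size_t v, size_t m) { return (v + m - 1) / m * m; }

static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y) {
    size_t computed_x = Align(output_x, block_x);   // elements actually computed in X
    size_t computed_y = Align(output_y, block_y);
    size_t simds_x = computed_x / block_x;          // SIMD launches per dimension
    size_t simds_y = computed_y / block_y;
    size_t unused_x = computed_x - output_x;        // wasted elements past the output edge
    size_t unused_y = computed_y - output_y;
    // Spread the waste evenly over the launches instead of dropping whole blocks.
    block_x -= unused_x / simds_x;
    block_y -= unused_y / simds_y;
}

int main() {
    // Hypothetical 30x30 output with 16x2 blocks: 2 block columns compute 32 elements,
    // 2 are wasted, so block_x shrinks from 16 to 15 and 30 = 2 * 15 fits exactly.
    size_t bx = 16, by = 2;
    shrink_blocks_to_output_size(30, 30, bx, by);
    std::cout << bx << "x" << by << "\n";  // prints 15x2
    return 0;
}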
@@ -1,59 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "convolution_kernel_base.h"
#include <string>
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_mmad_blocks : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    ConvolutionKernel_mmad_blocks();
    virtual ~ConvolutionKernel_mmad_blocks() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    KernelsData GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& p, const optional_params& o) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
    }
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }

private:
    struct AutoTuneOption {
        size_t blockWidth;
        size_t blockHeight;
        size_t prefetch;
        std::string exeMode;
    };

    AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const;
    std::vector<AutoTuneOption> autoTuneOptions = {};
};
} // namespace kernel_selector
@@ -1,119 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_mmad_slm_2x14_rep4.h"
#include "kernel_selector_utils.h"

namespace kernel_selector {

ParamsKey ConvolutionKernel_mmad_slm_2x14_rep4::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

bool ConvolutionKernel_mmad_slm_2x14_rep4::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }

    const convolution_params& cp = static_cast<const convolution_params&>(p);

    if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
        return false;

    if (cp.inputs[0].X().v != 56 || cp.inputs[0].Y().v != 56)
        return false;

    if (cp.stride.x != 1 || cp.stride.y != 1)
        return false;

    return true;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_2x14_rep4::SetDefault(const convolution_params& arg,
                                                                                     int) const {
    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);

    runInfo.efficiency = FORCE_PRIORITY_1;

    const size_t rep_count = 4;
    const size_t batch_per_wi = 1;
    const size_t out_block_width = 14;
    const size_t out_block_height = 2;
    runInfo.gws0 = arg.output.Feature().v *
                   (arg.output.Batch().v / (rep_count * batch_per_wi));  // number of tiles needed to cover output width
    runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width;
    runInfo.gws2 = ((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height;

    runInfo.lws0 = 32;  // depth
    runInfo.lws1 = 1;   // width
    runInfo.lws2 = 4;   // height

    return runInfo;
}

JitConstants ConvolutionKernel_mmad_slm_2x14_rep4::GetJitConstants(const convolution_params& params,
                                                                   const DispatchData& runInfo) const {
    auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);

    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));

    // pitch for special block format used in this kernel
    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));

    const size_t in_x_pitch = 32 * 4;
    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
    const size_t in_offset =
        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;

    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));

    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 14));
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 2));
    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", runInfo.lws2));

    return jit;
}

KernelsData ConvolutionKernel_mmad_slm_2x14_rep4::GetKernelsData(const Params& params,
                                                                 const optional_params& options) const {
    return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
}
} // namespace kernel_selector
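The FILTER_OFM_BLOCK_PITCH value above follows from the os_is_yx_isa8_osv8_isv4 weight blocking (8 OFM x 8 IFM x 4 inner-IFM int8 values per MMAD block, one block per 32-IFM slice per filter tap). A quick sanity check of the arithmetic, using an assumed 3x3 filter with 64 input channels purely as an example:

#include <cstddef>
#include <iostream>

// Assumed behavior of the kernel_selector Align utility: round v up to a multiple of m.
static size_t Align(size_t v, size_t m) { return (v + m - 1) / m * m; }

int main() {
    const size_t ifm = 64, filter_x = 3, filter_y = 3;  // hypothetical example shapes
    const size_t ifm_32_aligned = Align(ifm, 32);       // 64 -> two 32-IFM slices
    // Each slice of a 3x3 filter stores 3*3 taps * (4 * 8 * 8) int8 values.
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * filter_x * filter_y * 4 * 8 * 8;
    std::cout << filter_ofm_block_pitch << "\n";  // 2 * 9 * 256 = 4608 elements
    return 0;
}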
@@ -1,39 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_mmad_slm_2x14_rep4 : public ConvolutionKernelBase {
public:
    ConvolutionKernel_mmad_slm_2x14_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_2x14_rep4") {}
    virtual ~ConvolutionKernel_mmad_slm_2x14_rep4() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    bool Validate(const Params& p, const optional_params& o) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
    }
};
} // namespace kernel_selector
@@ -1,128 +0,0 @@
/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_kernel_mmad_slm_7x7_rep4.h"
#include "kernel_selector_utils.h"

namespace kernel_selector {

ParamsKey ConvolutionKernel_mmad_slm_7x7_rep4::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableInputWeightsType(WeightsType::INT8);
    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableBiasPerOutput();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

bool ConvolutionKernel_mmad_slm_7x7_rep4::Validate(const Params& p, const optional_params& o) const {
    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
        return false;
    }

    const convolution_params& cp = static_cast<const convolution_params&>(p);

    if (cp.filterSize.x != 3 || cp.filterSize.y != 3)
        return false;

    if (cp.stride.x != 1 || cp.stride.y != 1)
        return false;

    if (cp.inputs[0].X().v == 7 && cp.inputs[0].Y().v == 7)
        return true;

    if (cp.inputs[0].X().v == 14 && cp.inputs[0].Y().v == 14)
        return true;

    return false;
}

ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_slm_7x7_rep4::SetDefault(const convolution_params& arg,
                                                                                    int) const {
    DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);

    runInfo.efficiency = FORCE_PRIORITY_1;

    const size_t rep_count = 4;
    const size_t batch_per_wi = 4;
    const size_t out_block_width = 7;
    // const size_t out_block_height = 1;
    runInfo.gws0 = (arg.output.Feature().v * arg.output.Batch().v) /
                   (rep_count * batch_per_wi);  // number of tiles needed to cover output width
    runInfo.gws1 = ((arg.inputs[0].X().v / arg.stride.x) + (out_block_width - 1)) / out_block_width;
    // since this kernel only applies to 7x7 sizes we need to manually set gws2 to 8
    runInfo.gws2 = Align(arg.inputs[0].Y().v,
                         8);  // 8;//((arg.inputs[0].Y().v / arg.stride.y) + (out_block_height - 1)) / out_block_height;

    runInfo.lws0 = 16;  // depth
    runInfo.lws1 = 1;   // width
    runInfo.lws2 = 8;   // height

    return runInfo;
}

JitConstants ConvolutionKernel_mmad_slm_7x7_rep4::GetJitConstants(const convolution_params& params,
                                                                  const DispatchData& runInfo) const {
    auto jit = ConvolutionKernelBase::GetJitConstants(params, runInfo);

    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 8));

    // pitch for special block format used in this kernel
    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));

    const size_t in_x_pitch = 32 * 4;
    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
    const size_t in_offset =
        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;

    const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded();

    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));

    jit.AddConstant(MakeJitConstant("OUT_X_PITCH", in_x_pitch));
    jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch));
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", 7));
    jit.AddConstant(MakeJitConstant("OUT_BLOCK_HEIGHT", 1));
    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_X", runInfo.lws0));
    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Y", runInfo.lws1));
    jit.AddConstant(MakeJitConstant("LOCAL_SIZE_Z", 7));  // must be 7 since we process 7 in Y per workgroup

    return jit;
}

KernelsData ConvolutionKernel_mmad_slm_7x7_rep4::GetKernelsData(const Params& params,
                                                                const optional_params& options) const {
    return GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char");
}
} // namespace kernel_selector
@@ -1,39 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class ConvolutionKernel_mmad_slm_7x7_rep4 : public ConvolutionKernelBase {
public:
    ConvolutionKernel_mmad_slm_7x7_rep4() : ConvolutionKernelBase("convolution_gpu_mmad_slm_7x7_rep4") {}
    virtual ~ConvolutionKernel_mmad_slm_7x7_rep4() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
    bool Validate(const Params& p, const optional_params& o) const override;
    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
        return WeightsLayout::os_is_yx_isa8_osv8_isv4;
    }
};
} // namespace kernel_selector
@@ -33,9 +33,6 @@
#include "convolution_kernel_bfyx_1x1_gemm_buf.h"
#include "convolution_kernel_winograd_2x3_s1_fused.h"
#include "convolution_kernel_winograd_6x3_s1_fused.h"
#include "convolution_kernel_mmad.h"
#include "convolution_kernel_mmad_blocks.h"
#include "convolution_kernel_imad_byxf_af32_depthwise.h"
#include "convolution_kernel_bfyx_depthwise_weights_lwg.h"
#include "convolution_kernel_imad.h"
#include "convolution_kernel_fs_byx_fsv32.h"
@@ -51,7 +48,6 @@
#include "deformable_convolution_kernel_bfyx_interp.h"
#include "convolution_kernel_b_fs_zyx_fsv16_fp32.h"
#include "convolution_kernel_b_fs_zyx_fsv16_fp16.h"
#include "convolution_kernel_imad_byxf_af32_1x1.h"
#include "convolution_kernel_imad_b_fs_yx_fsv4_1x1.h"
#include "convolution_kernel_imad_b_fs_yx_fsv4_dw.hpp"
#include "convolution_kernel_mmad_bfyx_to_b_fs_yx_fsv4.h"
@@ -119,12 +115,6 @@ convolution_kernel_selector::convolution_kernel_selector() {
    Attach<ConvolutionKernel_Winograd_2x3_s1_fused>();
    Attach<ConvolutionKernel_Winograd_6x3_s1_fused>();

    // byxf_af32 int8
    Attach<ConvolutionKernel_mmad>();
    Attach<ConvolutionKernel_mmad_blocks>();
    Attach<ConvolutionKernel_imad_byxf_af32_1x1>();
    Attach<ConvolutionKernel_imad_byxf_af32_depthiwise>();

    // b_fs_yx_fsv4 kernels
    Attach<ConvolutionKernel_imad>();
    Attach<ConvolutionKernel_imad_b_fs_yx_fsv4_1x1>();
@@ -55,9 +55,6 @@ ParamsKey DeconvolutionKernel_imad_along_f_tile_bfx::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);

    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::byxf_af32);

    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.EnableBatching();
@@ -179,9 +176,6 @@ JitConstants DeconvolutionKernel_imad_along_f_tile_bfx::GetJitConstants(const de
            input_tile_ifm_pitch = zyx_pitch_factor * 16 * 16;
        }
        input_in_tile_batch_pitch = 16;
    } else if (in_layout == DataLayout::byxf_af32) {
        input_tile_ifm_pitch = tile_ifm;
        input_in_tile_batch_pitch = zyx_pitch_factor * Align(in.Feature().LogicalDimPadded(), 32);
    }

    jit.AddConstant(MakeJitConstant("INPUT_VALID_TILE_IFM_PITCH", input_tile_ifm_pitch != 0));
@@ -242,8 +236,7 @@ size_t DeconvolutionKernel_imad_along_f_tile_bfx::GetTileIFM(const deconvolution
        fsv = 16;
    }
    if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32
        || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32
        || params.inputs[0].GetLayout() == DataLayout::byxf_af32) {
        || params.inputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32) {
        fsv = 32;
    }
@@ -41,7 +41,6 @@ ParamsKey DeconvolutionKernel_imad_ref::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableAllOutputLayout();

    k.EnableDifferentTypes();
@@ -43,14 +43,12 @@ bool EltwiseKernel_vload8::Validate(const Params& params, const optional_params&
    const auto& ewParams = static_cast<const eltwise_params&>(params);

    for (size_t i = 0; i < ewParams.inputs.size(); i++) {
        if (ewParams.inputs[i].GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
            (ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
        if ((ewParams.inputs[i].GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
            (ewParams.inputs[i].GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.inputs[i].Feature().v % 16 != 0) ||
            ewParams.inputs[i].GetLayout() == DataLayout::fs_b_yx_fsv32)
            return false;
    }
    if (ewParams.output.GetLayout() == DataLayout::fs_bs_yx_bsv4_fsv32 ||
        (ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
    if ((ewParams.output.GetLayout() == DataLayout::b_fs_yx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
        (ewParams.output.GetLayout() == DataLayout::b_fs_zyx_fsv16 && ewParams.output.Feature().v % 16 != 0) ||
        ewParams.output.GetLayout() == DataLayout::fs_b_yx_fsv32)
        return false;
@@ -37,7 +37,6 @@ ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const {
    k.EnableDifferentTypes();

    k.EnableInputLayout(DataLayout::bfyx);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableOutputLayout(DataLayout::bf);
@@ -129,7 +128,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
    size_t input_y_pitch = input.Y().pitch;
    size_t input_z_pitch = input.Z().pitch;

    if (input.GetLayout() == DataLayout::byxf_af32 || input.GetLayout() == DataLayout::bfyx) {
    if (input.GetLayout() == DataLayout::bfyx) {
        jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32));
    } else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) {
        input_x_pitch = 32;
@@ -136,7 +136,7 @@ PoolingKernelBase::DispatchData PoolingKernelBase::SetDefault(const pooling_para
    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;

    if (output.GetLayout() == DataLayout::bfyx || output.GetLayout() == DataLayout::b_fs_yx_fsv4 ||
        output.GetLayout() == DataLayout::byxf || output.GetLayout() == DataLayout::byxf_af32 ||
        output.GetLayout() == DataLayout::byxf ||
        output.GetLayout() == DataLayout::bfzyx || output.GetLayout() == DataLayout::b_fs_zyx_fsv16 ||
        output.GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16) {
        // Determine global work sizes.
@@ -27,7 +27,6 @@ ParamsKey PoolingKerneGPU_b_fs_yx_fsv4::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
@@ -1,89 +0,0 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "pooling_kernel_gpu_byxf_af32.h"

namespace kernel_selector {
ParamsKey PoolingKerneGPU_byxf_af32::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnablePoolType(PoolType::MAX);
    k.EnablePoolType(PoolType::AVG);
    k.EnablePoolRemainder(PoolRemainder::FLOOR);
    k.EnablePoolRemainder(PoolRemainder::CEIL);
    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
    k.EnableDifferentTypes();
    return k;
}

PoolingKernelBase::DispatchData PoolingKerneGPU_byxf_af32::SetDefault(const pooling_params& params) const {
    constexpr int simdSize = 8;

    DispatchData runInfo = PoolingKernelBase::SetDefault(params);

    runInfo.gws0 = params.output.X().v;
    runInfo.gws1 = params.output.Y().v;
    // We have the byxf_af32 format, so if we process 4 features per work-item we cover 32 per
    // SIMD; divide the aligned feature count by 4 and we end up with 8 work-items per slice.
    runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * params.output.Batch().v) / 4;

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = simdSize;

    return runInfo;
}

JitConstants PoolingKerneGPU_byxf_af32::GetJitConstants(const pooling_params& params, DispatchData kd) const {
    JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd);

    jit.AddConstant(MakeJitConstant("AS_INPUT_TYPE(val)", "as_" + toCLType(params.inputs[0].GetDType()) + "4(val)"));
    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));

    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf = {"",
                                      {"b", "f", "y", "x"},
                                      "fused_pool_result",
                                      input_dt,
                                      4,
                                      LoadType::LT_UNALIGNED,
                                      BoundaryCheck::ENABLED,
                                      IndexType::TENSOR_COORD,
                                      Tensor::DataChannelName::FEATURE};
        jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
    }

    return jit;
}


KernelsData PoolingKerneGPU_byxf_af32::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
}
} // namespace kernel_selector
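For the dispatch above, the rounding works out as follows for an assumed 48-feature, batch-2 output (hypothetical shapes, purely to illustrate the arithmetic):

#include <cstddef>
#include <iostream>

// Assumed behavior of the kernel_selector RoundUp utility: round v up to a multiple of m.
static size_t RoundUp(size_t v, size_t m) { return (v + m - 1) / m * m; }

int main() {
    const size_t features = 48, batch = 2;                    // hypothetical example shapes
    // Features are padded to the af32 alignment of 32; each work-item handles 4 of them.
    const size_t gws2 = (RoundUp(features, 32) * batch) / 4;  // (64 * 2) / 4 = 32
    std::cout << gws2 << "\n";                                // 32 work-items = 4 SIMD-8 groups
    return 0;
}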
@@ -1,37 +0,0 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "pooling_kernel_base.h"
#include <vector>

namespace kernel_selector {
class PoolingKerneGPU_byxf_af32 : public PoolingKernelBase {
public:
    PoolingKerneGPU_byxf_af32() : PoolingKernelBase("pooling_gpu_byxf_af32") {}
    virtual ~PoolingKerneGPU_byxf_af32() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;
    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
    DispatchData SetDefault(const pooling_params& params) const override;
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
} // namespace kernel_selector
@@ -29,7 +29,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::bfzyx);
    k.EnableInputLayout(DataLayout::yxfb);
    k.EnableInputLayout(DataLayout::byxf);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
@@ -38,7 +37,6 @@ ParamsKey PoolingKernelGPUInt8Ref::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::bfzyx);
    k.EnableOutputLayout(DataLayout::yxfb);
    k.EnableOutputLayout(DataLayout::byxf);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
@@ -18,7 +18,6 @@
#include "pooling_kernel_gpu_byxf_opt.h"
#include "pooling_kernel_gpu_bfyx_block_opt.h"
#include "pooling_kernel_gpu_byxf_padding_opt.h"
#include "pooling_kernel_gpu_byxf_af32.h"
#include "pooling_kernel_gpu_int8_ref.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv4.h"
#include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
@@ -35,7 +34,6 @@ pooling_kernel_selector::pooling_kernel_selector() {
    Attach<PoolingKernelGPUBfyxBlockOpt>();
    Attach<PoolingKernelGPUByxfPaddingOpt>();
    Attach<PoolingKernelGPUInt8Ref>();
    Attach<PoolingKerneGPU_byxf_af32>();
    Attach<PoolingKerneGPU_b_fs_yx_fsv4>();
    Attach<PoolingKerneGPU_fs_b_yx_fsv32>();
    Attach<PoolingKernel_b_fs_yx_fsv16>();
@@ -40,14 +40,12 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::yxfb);
    k.EnableOutputLayout(DataLayout::bfzyx);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
@@ -1,87 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
#include "kernel_selector_utils.h"

namespace kernel_selector {
ParamsKey reorder_kernel_byxf_f32_to_byx8_f4_i8::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableDifferentTypes();
    k.EnableInputLayout(DataLayout::byxf);
    k.EnableOutputLayout(DataLayout::byx8_f4);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    return k;
}

bool reorder_kernel_byxf_f32_to_byx8_f4_i8::Validate(const Params& p, const optional_params& o) const {
    if (!ReorderKernelBase::Validate(p, o)) {
        return false;
    }

    const reorder_params& params = static_cast<const reorder_params&>(p);

    if (params.output.X().v % 16 != 0)
        return false;

    if (params.inputs[0].Feature().v != 3)
        return false;

    if (params.mode == MeanSubtractMode::IN_BUFFER && params.mean.LogicalSize() != params.inputs[0].Feature().v)
        return false;

    return true;
}

size_t static get_wg_batch_size(const reorder_params& params) {
    if (params.inputs[0].Batch().v % 16 == 0)
        return 16;
    return 1;
}

reorder_kernel_byxf_f32_to_byx8_f4_i8::DispatchData reorder_kernel_byxf_f32_to_byx8_f4_i8::SetDefault(
    const reorder_params& params) const {
    DispatchData kd;

    const auto& input = params.inputs[0];

    kd.gws0 = input.X().v;
    kd.gws1 = input.Y().v;
    kd.gws2 = input.Batch().v;

    kd.lws0 = 16;
    kd.lws1 = 1;
    kd.lws2 = get_wg_batch_size(params);

    return kd;
}

JitConstants reorder_kernel_byxf_f32_to_byx8_f4_i8::GetJitConstants(const reorder_params& params) const {
    auto jit = ReorderKernelBase::GetJitConstants(params);
    jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0]));
    jit.AddConstant(MakeJitConstant("WG_BATCH_SIZE", get_wg_batch_size(params)));
    return jit;
}

KernelsData reorder_kernel_byxf_f32_to_byx8_f4_i8::GetKernelsData(const Params& params,
                                                                  const optional_params& options) const {
    const reorder_params& orgParams = static_cast<const reorder_params&>(params);
    return GetCommonKernelsData(orgParams, options, FORCE_PRIORITY_5);
}
} // namespace kernel_selector
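As a rough sketch of the work sizes this reorder would have requested, here is the dispatch computed for an assumed 224x224, batch-32 input (hypothetical shapes chosen so the batch is divisible by 16):

#include <cstddef>
#include <iostream>

int main() {
    const size_t x = 224, y = 224, batch = 32;           // hypothetical example input
    const size_t wg_batch = (batch % 16 == 0) ? 16 : 1;  // mirrors get_wg_batch_size
    // Global sizes cover every (x, y, b) point; local sizes group 16 x-positions with
    // up to 16 batches, so one work-group reorders a 16-wide, 16-batch tile at once.
    std::cout << "gws=" << x << "x" << y << "x" << batch
              << " lws=16x1x" << wg_batch << "\n";
    return 0;
}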
@@ -1,32 +0,0 @@
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "reorder_kernel_base.h"

namespace kernel_selector {
class reorder_kernel_byxf_f32_to_byx8_f4_i8 : public ReorderKernelBase {
public:
    reorder_kernel_byxf_f32_to_byx8_f4_i8() : ReorderKernelBase("reorder_data_byxf_f32_to_byx8_f4_i8") {}
    virtual ~reorder_kernel_byxf_f32_to_byx8_f4_i8() {}

    bool Validate(const Params& p, const optional_params& o) const override;
    DispatchData SetDefault(const reorder_params& params) const override;
    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;
    JitConstants GetJitConstants(const reorder_params& params) const override;
};
} // namespace kernel_selector
@@ -33,7 +33,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::bfwzyx);
    k.EnableInputLayout(DataLayout::bs_f_bsv8__af8);
    k.EnableInputLayout(DataLayout::bs_f_bsv16__af8);
    k.EnableInputLayout(DataLayout::bf8_xy16);
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);

@@ -45,7 +44,6 @@ ParamsKey ReorderKernelFastBatch1::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::bfwzyx);
    k.EnableOutputLayout(DataLayout::bs_f_bsv8__af8);
    k.EnableOutputLayout(DataLayout::bs_f_bsv16__af8);
    k.EnableOutputLayout(DataLayout::bf8_xy16);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);

@@ -19,7 +19,6 @@
#include "reorder_from_winograd_2x3_kernel.h"
#include "reorder_to_winograd_2x3_kernel.h"
#include "reorder_kernel_to_yxfb_batched.h"
#include "reorder_kernel_byxf_f32_to_byx8_f4_i8.h"
#include "reorder_kernel_binary.h"
#include "reorder_biplanar_nv12.h"
#include "reorder_kernel_fs_b_yx_fsv32_to_bfyx.h"
@@ -33,7 +32,6 @@ reorder_kernel_selector::reorder_kernel_selector() {
    Attach<ReorderFromWinograd2x3Kernel>();
    Attach<ReorderToWinograd2x3Kernel>();
    Attach<ReorderKernel_to_yxfb_batched>();
    Attach<reorder_kernel_byxf_f32_to_byx8_f4_i8>();
    Attach<reorder_biplanar_nv12>();
    Attach<ReorderKernel_fs_b_yx_fsv32_to_bfyx>();
}
@@ -61,8 +61,6 @@ static size_t packing_factor(const resample_params& params) {
        return 16;
    case DataLayout::b_fs_yx_fsv4:
        return 4;
    case DataLayout::byxf_af32:
        return 16;
    default:
        break;
    }
@@ -17,7 +17,7 @@

#if FP16_UNIT_USED
    #define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))

    #define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
    { \
        const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
@@ -64,9 +64,9 @@

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_bfyx_1x1)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
@@ -107,10 +107,10 @@ KERNEL(convolution_bfyx_1x1)(
    {
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00;
        MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;

        uint input_idx = input_offset + k * 8 * xy_block_num * 16;
        uint filter_idx = filter_offset + k * 8 * 16;

        blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
        blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx);

@@ -124,11 +124,7 @@ KERNEL(convolution_bfyx_1x1)(

    for(uint i = 0; i < 16; i++)
    {
#if OUTPUT_LAYOUT_BF8_XY16
        const uint dst_index = GET_DATA_BF8_XY16_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
#else
        const uint dst_index = GET_DATA_INDEX(OUTPUT, b, group_f+i, y, x) + out_split_offset;
#endif
#if LEFTOVERS
        if(group_f+i < OUTPUT_FEATURE_NUM)
#endif
@ -1,214 +0,0 @@
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "include/common.cl"
|
||||
|
||||
#include "include/data_types.cl"
|
||||
#include "include/fetch.cl"
|
||||
#include "include/mmad.cl"
|
||||
|
||||
#if STRIDE_SIZE_Y == DILATION_SIZE_Y
|
||||
#define BLOCK_Y_SIZE (FILTER_SIZE_Y + (SPLIT_Y - 1))
|
||||
#define LOAD_Y_WITH_STRIDES
|
||||
#else
|
||||
#define BLOCK_Y_SIZE ((SPLIT_Y - 1) * STRIDE_SIZE_Y + (FILTER_SIZE_Y - 1) * (DILATION_SIZE_Y - 1) + FILTER_SIZE_Y)
|
||||
#endif
|
||||
|
||||
#if STRIDE_SIZE_X == DILATION_SIZE_X
|
||||
#define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
|
||||
#define LOAD_X_WITH_STRIDES
|
||||
#else
|
||||
#define FILTER_SIZE_X_PRELOAD FILTER_SIZE_X
|
||||
#define LOAD_X_WITH_STRIDES
|
||||
#define DONT_USE_X_SHIFTS
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
||||
KERNEL(convolution_gpu_byxf_af32_depthwise)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
__global FILTER_TYPE* weights,
|
||||
#if BIAS_TERM
|
||||
__global BIAS_TYPE* biases,
|
||||
#endif
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
FUSED_OPS_DECLS,
|
||||
#endif
|
||||
uint split_idx)
|
||||
{
|
||||
const uint x = get_global_id(1) * OUT_BLOCK_WIDTH;
|
||||
const uint y = get_global_id(2) * SPLIT_Y;
|
||||
#if OUTPUT_BATCH_NUM == 1
|
||||
const uint of = get_global_id(0);
|
||||
const uint b = 0;
|
||||
#else
|
||||
const uint of = (uint)get_global_id(0) % ALIGNED_OFM;
|
||||
const uint b = (uint)get_global_id(0) / ALIGNED_OFM;
|
||||
#endif
|
||||
const uint g = of;
|
||||
|
||||
if (of >= OUTPUT_FEATURE_NUM)
|
||||
return;
|
||||
|
||||
int dotProd[SPLIT_Y] = {0};
|
||||
OUTPUT_TYPE out[SPLIT_Y];
|
||||
const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||
const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
|
||||
|
||||
const uint filter_offset = g*FILTER_GROUPS_PITCH;
|
||||
const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + g*FILTER_IFM_NUM;
|
||||
|
||||
// read all weights
|
||||
FILTER_TYPE w[FILTER_IFM_PITCH];
|
||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
|
||||
for (int j = 0; j < FILTER_SIZE_Y; j++) {
|
||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
|
||||
for (int i = 0; i < FILTER_SIZE_X; i++) {
|
||||
w[j * FILTER_SIZE_X + i] = weights[filter_offset + j * FILTER_Y_PITCH + i * FILTER_X_PITCH];
|
||||
}
|
||||
}
|
||||
|
||||
// initial input read
|
||||
INPUT0_TYPE in[FILTER_SIZE_X_PRELOAD * BLOCK_Y_SIZE];
|
||||
__attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
|
||||
for (int i = 0; i < BLOCK_Y_SIZE; i++) {
|
||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
|
||||
for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
|
||||
#ifdef LOAD_Y_WITH_STRIDES
|
||||
int input_offset_y = input_y + i * DILATION_SIZE_Y;
|
||||
#else
|
||||
int input_offset_y = input_y + i;
|
||||
#endif
|
||||
#ifdef LOAD_X_WITH_STRIDES
|
||||
int input_offset_x = input_x + j * DILATION_SIZE_X;
|
||||
#else
|
||||
int input_offset_x = input_x + j;
|
||||
#endif
|
||||
uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
|
||||
in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
|
||||
}
|
||||
}
|
||||
|
||||
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
|
||||
FUSED_OPS_PRELOAD;
|
||||
#endif
|
||||
|
    for (int l = 0; l < OUT_BLOCK_WIDTH; l++) {
        // calculate dot product
        __attribute__((opencl_unroll_hint(SPLIT_Y)))
        for (int i = 0; i < SPLIT_Y; i++) {
            __attribute__((opencl_unroll_hint(FILTER_IFM_PITCH)))
            for (int j = 0; j < FILTER_IFM_PITCH; j++) {
#if defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
                const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
                dotProd[i] += (int)in[start_pos_y + j] * (int)w[j];
#elif defined(LOAD_X_WITH_STRIDES) && !defined(LOAD_Y_WITH_STRIDES)
                const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
                const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
                const uint pos_x = (j % FILTER_SIZE_X);
                dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
#elif defined(LOAD_Y_WITH_STRIDES) && !defined(LOAD_X_WITH_STRIDES)
                const uint start_pos_y = i * FILTER_SIZE_X_PRELOAD;
                const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * FILTER_SIZE_X_PRELOAD;
                const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
                dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
#else
                const uint start_pos_y = i * STRIDE_SIZE_Y * FILTER_SIZE_X_PRELOAD;
                const uint pos_y = start_pos_y + (j / FILTER_SIZE_X) * DILATION_SIZE_Y * FILTER_SIZE_X_PRELOAD;
                const uint pos_x = (j % FILTER_SIZE_X) * DILATION_SIZE_X;
                dotProd[i] += (int)in[pos_y + pos_x] * (int)w[j];
#endif // defined(LOAD_X_WITH_STRIDES) && defined(LOAD_Y_WITH_STRIDES)
            }
        }

        __attribute__((opencl_unroll_hint(BLOCK_Y_SIZE)))
        for (int i = 0; i < BLOCK_Y_SIZE; i++) {
            // inputs shift
#ifndef DONT_USE_X_SHIFTS
#if (FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X) > 0
            __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X)))
#endif
            for (int j = 0; j < FILTER_SIZE_X_PRELOAD - STRIDE_SIZE_X; j++) {
                in[i * FILTER_SIZE_X_PRELOAD + j] = in[i * FILTER_SIZE_X_PRELOAD + j + STRIDE_SIZE_X];
            }
#endif

            // read additional inputs
#ifdef LOAD_Y_WITH_STRIDES
            int input_offset_y = input_y + i * DILATION_SIZE_Y;
#else
            int input_offset_y = input_y + i;
#endif // LOAD_Y_WITH_STRIDES

#if defined(DONT_USE_X_SHIFTS)
            __attribute__((opencl_unroll_hint(FILTER_SIZE_X_PRELOAD)))
            for (int j = 0; j < FILTER_SIZE_X_PRELOAD; j++) {
                int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + j * DILATION_SIZE_X;
                uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
                in[i * FILTER_SIZE_X_PRELOAD + j] = input[input_idx];
            }
#else
            {
                int input_offset_x = input_x + ((l + 1) * STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) * DILATION_SIZE_X;
                uint input_idx = input_offset + (uint)input_offset_x * INPUT0_X_PITCH + (uint)input_offset_y * INPUT0_Y_PITCH;
                in[i * FILTER_SIZE_X_PRELOAD + FILTER_SIZE_X_PRELOAD - 1] = input[input_idx];
            }
#endif // defined(DONT_USE_X_SHIFTS)
        }

        __attribute__((opencl_unroll_hint(SPLIT_Y)))
        for (int m = 0; m < SPLIT_Y; m++) {
#if BIAS_TERM
#if BIAS_PER_OUTPUT
#if OUTPUT_LAYOUT_BYXF_AF32 == 1
            const uint bias_index = GET_DATA_INDEX(BIAS, b, of, y + m, x + l);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
            const uint bias_index = GET_DATA_B_FS_YX_FSV4_INDEX(BIAS, b, of, y + m, x + l);
#else
#error "Incorrect output layout"
#endif
#elif BIAS_PER_OFM
            const uint bias_index = of;
#endif
            // TODO: Maybe half should be supported as well.
            float res = (float)dotProd[m] + biases[bias_index];
#else
            float res = (float)dotProd[m];
#endif
            dotProd[m] = 0;

#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
            FUSED_OPS_CALC;
#else
            FUSED_OPS;
#endif
            out[m] = FUSED_OPS_RESULT;
#else
            out[m] = TO_OUTPUT_TYPE(res);
#endif
        }

        __attribute__((opencl_unroll_hint(SPLIT_Y)))
        for (int m = 0; m < SPLIT_Y; m++) {
#ifdef SPLIT_LEFTOVERS
            if (y + m >= OUTPUT_SIZE_Y)
                continue;
#endif
            const uint dst_index = OUTPUT_GET_INDEX(b, of, y + m, x + l);
            output[dst_index] = ACTIVATION(out[m], ACTIVATION_PARAMS);
        }
    } // OUT_BLOCK_WIDTH
}
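The "inputs shift" loop above implements sliding-window reuse across output columns: the preload buffer is shifted left by the stride and only the rightmost element of each row is refetched, while the DONT_USE_X_SHIFTS path refetches the whole row instead. Schematically (a sketch, with W = FILTER_SIZE_X_PRELOAD and S = STRIDE_SIZE_X):

// per buffer row, when moving from output column l to l + 1:
//   keep : in[0 .. W-S)  <-  in[S .. W)         // the shift loop
//   fetch: in[W-1]       <-  next input pixel   // "read additional inputs"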
@ -1,124 +0,0 @@
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"

#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/mmad.cl"

#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)

__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(convolution_MMAD)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
    const uint f = get_global_id(2);
    const uint b = 0;
#else
    const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
    const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
#endif

    int dotProd = 0;

    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;

    const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;

    for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
    {
        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
        {
            const int input_offset_y = input_y + j * DILATION_SIZE_Y;
            const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;

            if(!zero_y)
            {
                for (uint i = 0; i < FILTER_SIZE_X ; ++i)
                {
                    const int input_offset_x = input_x + i * DILATION_SIZE_X;
                    const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;

                    if(!zero_x)
                    {
                        uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32;
                        uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;

                        PACKED_TYPE input_data = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
                        MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations;  // activations of all lanes
                        activations.s0 = sub_group_broadcast(input_data, 0);
                        activations.s1 = sub_group_broadcast(input_data, 1);
                        activations.s2 = sub_group_broadcast(input_data, 2);
                        activations.s3 = sub_group_broadcast(input_data, 3);
                        activations.s4 = sub_group_broadcast(input_data, 4);
                        activations.s5 = sub_group_broadcast(input_data, 5);
                        activations.s6 = sub_group_broadcast(input_data, 6);
                        activations.s7 = sub_group_broadcast(input_data, 7);

                        int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));

                        dotProd = MMAD_8(activations, weights_data, dotProd);
                    }
                }
            }
        }
    }

#if BIAS_TERM
#if BIAS_PER_OUTPUT
    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
    const uint bias_index = f;
#endif
    float res = (float)dotProd + biases[bias_index];
#else
    float res = (float)dotProd;
#endif // BIAS_TERM

#if HAS_FUSED_OPS
    FUSED_OPS;
    OUTPUT_TYPE result = FUSED_OPS_RESULT;
#else
    OUTPUT_TYPE result = TO_OUTPUT_TYPE(res);
#endif

    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
    const uint dst_index = OUTPUT_GET_INDEX(b, f, y, x) + out_split_offset;
    output[dst_index] = result;
}

#undef FILTER_IFM_MMAD_NUM
#undef FILTER_OFM_MMAD_NUM
#undef FILTER_IFM_ALIGNED
#undef FILTER_OFM_ALIGNED
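MMAD_8 above (from include/mmad.cl) folds 32 int8 multiply-accumulates into one scalar accumulator per call. A reference sketch of that contract, illustrative only, with signedness following PACKED_TYPE:

inline int mmad_8_ref(const int act[8], const int wei[8], int acc) {
    for (int p = 0; p < 8; ++p) {
        char4 a = as_char4(act[p]);  // 4 packed int8 activations
        char4 w = as_char4(wei[p]);  // 4 packed int8 weights
        acc += (int)a.s0 * (int)w.s0 + (int)a.s1 * (int)w.s1
             + (int)a.s2 * (int)w.s2 + (int)a.s3 * (int)w.s3;
    }
    return acc;
}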
@ -1,158 +0,0 @@
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"

#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/mmad.cl"

#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)

__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(convolution_MMAD_blocks)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    const uint x = (uint)get_global_id(0) * OUTPUT_BLOCK_WIDTH;
    const uint y = (uint)get_global_id(1) * OUTPUT_BLOCK_HEIGHT;
#if OUTPUT_BATCH_NUM == 1
    const uint f = (uint)get_global_id(2);
    const uint b = 0;
#else
    const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED;
    const uint b = (uint)get_global_id(2) / FILTER_OFM_ALIGNED;
#endif

    int acc[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
    PACKED_TYPE in[IN_BLOCK_ARRAY_SIZE];

    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;

    const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;

    uint in_addr = input_offset + input_x * INPUT0_X_PITCH + input_y * INPUT0_Y_PITCH;
    uint filter_idx = filter_offset;

    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
    {
        // preload input data
        for(uint in_block_pos = 0; in_block_pos < IN_BLOCK_ARRAY_SIZE; in_block_pos++)
        {
            uint block_x = in_block_pos % IN_BLOCK_WIDTH;
            uint block_y = in_block_pos / IN_BLOCK_WIDTH;
            uint input_idx = in_addr + block_x * INPUT0_X_PITCH + block_y * INPUT0_Y_PITCH;
            in[in_block_pos] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
        }
        // end of preloading input data

        __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
        {
            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
            for (uint i = 0; i < FILTER_SIZE_X ; ++i)
            {
                int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));

                __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_HEIGHT)))
                for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
                {
                    __attribute__((opencl_unroll_hint(OUTPUT_BLOCK_WIDTH)))
                    for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
                    {
                        PACKED_TYPE input_data = in[(br * STRIDE_SIZE_Y + j) * IN_BLOCK_WIDTH + bc * STRIDE_SIZE_X + i];
                        MAKE_VECTOR_TYPE(PACKED_TYPE, 8) activations;  // activations of all lanes
                        activations.s0 = sub_group_broadcast(input_data, 0);
                        activations.s1 = sub_group_broadcast(input_data, 1);
                        activations.s2 = sub_group_broadcast(input_data, 2);
                        activations.s3 = sub_group_broadcast(input_data, 3);
                        activations.s4 = sub_group_broadcast(input_data, 4);
                        activations.s5 = sub_group_broadcast(input_data, 5);
                        activations.s6 = sub_group_broadcast(input_data, 6);
                        activations.s7 = sub_group_broadcast(input_data, 7);

                        acc[br * OUTPUT_BLOCK_WIDTH + bc] = MMAD_8(activations, weights_data, acc[br * OUTPUT_BLOCK_WIDTH + bc]);
                    }
                }
                filter_idx += 32*8;  // 32 features per channel * 8 output features per SIMD channel
            }
        }
        in_addr += 32;  // 4 features per channel * 8 SIMD channels
    }

#if BIAS_TERM
#if BIAS_PER_OUTPUT
    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
    const uint bias_index = f;
#endif
#endif // BIAS_TERM

    OUTPUT_TYPE out[OUTPUT_BLOCK_WIDTH * OUTPUT_BLOCK_HEIGHT] = { 0 };
    for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
    {
        for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
        {
#if BIAS_TERM
            // TODO: Maybe half should be supported as well.
            float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc] + biases[bias_index];
#else
            float res = (float)acc[br * OUTPUT_BLOCK_WIDTH + bc];
#endif
#if HAS_FUSED_OPS
            FUSED_OPS;
            out[br * OUTPUT_BLOCK_WIDTH + bc] = FUSED_OPS_RESULT;
#else
            out[br * OUTPUT_BLOCK_WIDTH + bc] = TO_OUTPUT_TYPE(res);
#endif
        }
    }

    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
    for(uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++)
    {
        if(y + br < OUTPUT_SIZE_Y)
        {
            for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++)
            {
                if(x + bc < OUTPUT_SIZE_X)
                {
                    const uint dst_index = OUTPUT_GET_INDEX(b, f, y+br, x+bc) + out_split_offset;
                    output[dst_index] = out[br * OUTPUT_BLOCK_WIDTH + bc];
                }
            }
        }
    }
}

#undef FILTER_IFM_MMAD_NUM
#undef FILTER_OFM_MMAD_NUM
#undef FILTER_IFM_ALIGNED
#undef FILTER_OFM_ALIGNED
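The in[] access (br * STRIDE_SIZE_Y + j) * IN_BLOCK_WIDTH + bc * STRIDE_SIZE_X + i above only stays in bounds if the preloaded block covers the whole receptive field of the output tile; the usual sliding-window bound, as a sketch (assuming IN_BLOCK_WIDTH and IN_BLOCK_ARRAY_SIZE are derived this way on the host side):

// smallest input block covering `out_w` output columns with stride `s`
// and a unit-dilation filter of `filt_w` taps:
inline int in_block_width(int out_w, int s, int filt_w) {
    return (out_w - 1) * s + filt_w;
}
// e.g. OUTPUT_BLOCK_WIDTH = 4, stride 2, 3x3 filter -> IN_BLOCK_WIDTH = 9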
@ -1,163 +0,0 @@
// Copyright (c) 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"

#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/imad.cl"

#if defined(ACCUMULATOR_TYPE)
#undef ACCUMULATOR_TYPE
#endif

#if QUANTIZATION_TERM
#   define ACCUMULATOR_TYPE int
#   define ACTIVATION_TYPE float
#   define TO_ACTIVATION_TYPE(x) convert_float(x)
#else
#   define ACCUMULATOR_TYPE INPUT0_TYPE
#   define ACTIVATION_TYPE INPUT0_TYPE
#   define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
#endif

#define FILTER_IFM_SLICES_NUM ((FILTER_IFM_NUM + 31) / 32)
#define FILTER_OFM_NUM_ALIGNED ((FILTER_OFM_NUM + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE * SUB_GROUP_SIZE)

// we are packing 4 8bit activations per 32 bit
#define PACK 4

#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)

__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(fused_conv_eltwise_gpu_af32_imad_1x1)(
    const __global PACKED_TYPE* input,
    __global OUTPUT_TYPE* restrict output,
    const __global uint* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    const uint x = (uint)get_global_id(0) * TILE_LENGTH % OUTPUT_SIZE_X;
    const uint y = (uint)get_global_id(0) * TILE_LENGTH / OUTPUT_SIZE_X;
    const uint f = (((uint)get_global_id(1) * TILE_DEPTH) % FILTER_OFM_NUM_ALIGNED) / (TILE_DEPTH * SUB_GROUP_SIZE) * (TILE_DEPTH * SUB_GROUP_SIZE);
    const uint b = ((uint)get_global_id(1) * TILE_DEPTH) / FILTER_OFM_NUM_ALIGNED;
    const uint lid = get_sub_group_local_id();

    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    PACKED_TYPE input_slice[TILE_LENGTH];
    int8 weights_slice;
    ACCUMULATOR_TYPE accu[TILE_LENGTH][TILE_DEPTH] = {0};

    uint filter_idx = f * FILTER_IFM_SLICES_NUM * 32 / PACK;
    uint in_addr = (INPUT0_GET_INDEX(b, 0, input_y, input_x)) / PACK;

    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < FILTER_IFM_SLICES_NUM; ++k)
    {
        // Read 32 input features for each pixel in the tile. 4 features in each int, 8 ints across SIMD
        __attribute__((opencl_unroll_hint(TILE_LENGTH)))
        for (uint i = 0; i < TILE_LENGTH; ++i)
        {
            uint tmp_addr = in_addr + i * INPUT0_X_PITCH * STRIDE_SIZE_X / PACK;
            input_slice[i] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*)input + tmp_addr));
        }

        // Loop through TILE_DEPTH output features
        __attribute__((opencl_unroll_hint(TILE_DEPTH)))
        for (uint of = 0; of < TILE_DEPTH; ++of)
        {
            // Read 32 weights. 8 ints, 4 weights in each int, each SIMD lane has its own weights
            weights_slice = as_int8(intel_sub_group_block_read8(weights + filter_idx));

            __attribute__((opencl_unroll_hint(TILE_LENGTH)))
            for (uint i = 0; i < TILE_LENGTH; ++i)
            {
                PACKED_TYPE A_scalar;
                A_scalar = sub_group_broadcast(input_slice[i], 0); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s0));
                A_scalar = sub_group_broadcast(input_slice[i], 1); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s1));
                A_scalar = sub_group_broadcast(input_slice[i], 2); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s2));
                A_scalar = sub_group_broadcast(input_slice[i], 3); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s3));
                A_scalar = sub_group_broadcast(input_slice[i], 4); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s4));
                A_scalar = sub_group_broadcast(input_slice[i], 5); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s5));
                A_scalar = sub_group_broadcast(input_slice[i], 6); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s6));
                A_scalar = sub_group_broadcast(input_slice[i], 7); accu[i][of] = IMAD(accu[i][of], AS_INPUT0_TYPE_4(A_scalar), as_char4(weights_slice.s7));
            }

            filter_idx += 32 * 8 / 4;  // 32 features per channel * 8 SIMD channels / sizeof(int)
        }
        in_addr += 4 * 8 / 4;  // 4 features per channel * 8 SIMD channels / sizeof(int) -> next 32 input features
    }

#if TILE_DEPTH == 8
    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result[TILE_LENGTH];
#elif TILE_DEPTH == 4
    MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result[TILE_LENGTH];
#endif

    uint dst_index = (OUTPUT_GET_INDEX(b, f, y, x)) / PACK;

    __attribute__((opencl_unroll_hint(TILE_LENGTH)))
    for (uint i = 0; i < TILE_LENGTH; ++i)
    {
        __attribute__((opencl_unroll_hint(TILE_DEPTH)))
        for (uint j = 0; j < TILE_DEPTH; ++j)
        {
            const uint f2 = f + lid * 4 + (j % 4) + (j / 4 * 32);
            ACCUMULATOR_TYPE dotProd = accu[i][j];
#if BIAS_TERM
#if BIAS_PER_OUTPUT
            const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
            const uint bias_index = f2;
#endif
            ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd) + TO_ACTIVATION_TYPE(biases[bias_index]);
#else
            ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
#endif // BIAS_TERM

#if HAS_FUSED_OPS
            FUSED_OPS;
            result[i][j] = FUSED_OPS_RESULT;
#else
            result[i][j] = TO_OUTPUT_TYPE(res);
#endif
        }
    }

    __attribute__((opencl_unroll_hint(TILE_LENGTH)))
    for (uint i = 0; i < TILE_LENGTH; ++i)
    {
#if TILE_DEPTH == 8
        intel_sub_group_block_write2((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint2(result[i]));
#elif TILE_DEPTH == 4
        intel_sub_group_block_write((__global uint*)output + dst_index + i * OUTPUT_X_PITCH / PACK, as_uint(result[i]));
#endif
    }
}
#undef FILTER_IFM_SLICES_NUM
#undef FILTER_OFM_NUM_ALIGNED
#undef ACCUMULATOR_TYPE
#undef ACTIVATION_TYPE
#undef TO_ACTIVATION_TYPE
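IMAD above is the 4-way int8 dot-product-accumulate from include/imad.cl; in scalar form the contract is (a sketch, not the actual implementation):

inline int imad_ref(int acc, char4 a, char4 b) {
    // acc += dot(a, b) over the 4 packed int8 lanes, widened to int
    return acc + (int)a.s0 * (int)b.s0
               + (int)a.s1 * (int)b.s1
               + (int)a.s2 * (int)b.s2
               + (int)a.s3 * (int)b.s3;
}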
@ -129,7 +129,6 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
#else
            in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);  // read SIMD_SIZE elements wide
#endif
            // TODO This will cause errors for byxf_af32 format on input
            in_addr += (INPUT0_SIZE_X + IWPAD);  // move to next row down
#endif
        }
@ -191,9 +190,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
            if(!zero_c)
#endif
            {
#if OUTPUT_LAYOUT_BYXF_AF32 == 1
                uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
#if OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
                uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1 || OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16 == 1
                uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);

@ -99,20 +99,6 @@
        ((b) / (sub_group_size))*CAT(prefix, _BATCH_PITCH) \
    )

inline uint FUNC(get_bf8_xy16_index)(uint b, uint f, uint y, uint x, uint x_size, uint y_size, uint f_size, uint offset)
{
    const uint xy_idx = x + y * x_size;
    const uint xy_offset = (xy_idx % 16) + (xy_idx / 16) * 16 * 8;
    const uint xy_block_num = (x_size * y_size + 16 - 1) / 16;
    const uint f_offset = (f % 8) * 16 + (f / 8) * xy_block_num * 16 * 8;
    const uint f_block_num = (f_size + 8 - 1) / 8;
    const uint b_offset = b * f_block_num * xy_block_num * 128;

    const size_t idx = offset + xy_offset + f_offset + b_offset;

    return idx;
}
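Worked by hand for illustration, the formula above gives, for x_size = 4, y_size = 8, b = 0, f = 9, y = 3, x = 2, offset = 0:

// xy_idx       = 2 + 3 * 4                        = 14
// xy_offset    = 14 % 16 + (14 / 16) * 16 * 8     = 14
// xy_block_num = (4 * 8 + 15) / 16                = 2
// f_offset     = (9 % 8) * 16 + (9 / 8) * 2 * 128 = 272
// idx          = 0 + 14 + 272 + 0                 = 286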
inline uint FUNC(get_b_fs_yx_fsv_index)(uint b, uint f, uint y, uint x,
                                        uint x_size, uint y_size, uint f_size, uint b_size,
                                        uint b_pad_before, uint b_pad_after,
@ -495,93 +481,6 @@ inline uint FUNC(get_os_zyxi_osv16_index)(uint o, uint i, uint z, uint y, uint x
        CAT(prefix, _SIZE_Y), \
        CAT(prefix, _SIZE_Z))

inline uint FUNC(get_byxf_af32_index)(uint b, uint f, uint y, uint x, uint y_pitch, uint b_pitch, uint f_size, uint f_pad_before, uint f_pad_after, uint offset)
{
    const uint f_aligned_to_32 = ((f_size + 31) / 32) * 32;
    const uint x_pitch = f_pad_before + f_aligned_to_32 + f_pad_after;
    const uint b_offset = b * b_pitch;
    const uint xy_offset = x_pitch * x + y_pitch * y;
    const uint f_offset = f;
    const size_t idx = offset + xy_offset + b_offset + f_offset;
    return idx;
}

#define GET_DATA_BYXF_AF32_INDEX(prefix, b, f, y, x) \
    FUNC_CALL(get_byxf_af32_index)( \
        b, f, y, x, CAT(prefix, _Y_PITCH), \
        CAT(prefix, _BATCH_PITCH), \
        CAT(prefix, _FEATURE_NUM), \
        CAT(prefix, _PAD_BEFORE_FEATURE_NUM), \
        CAT(prefix, _PAD_AFTER_FEATURE_NUM), \
        CAT(prefix, _OFFSET))
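In other words, byxf_af32 is byxf with the feature count padded up to a multiple of 32, so every (x, y) step strides over the aligned feature block. A worked example with illustrative values (f_size = 3, no feature padding, y_pitch = 32 * x_size):

// f_aligned_to_32 = ((3 + 31) / 32) * 32 = 32  ->  x_pitch = 32
// idx(b = 0, f = 2, y = 0, x = 1) = 32 * 1 + 0 + 0 + 2 = 34
// i.e. 32 slots per pixel even though only 3 features are meaningful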
inline uint FUNC(get_byx8_f4_index)(uint b, uint f, uint y, uint x,
    uint x_pitch, uint y_pitch, uint b_pitch, uint f_size, uint x_size, uint offset)
{
    const uint f_aligned_to_4 = ((f_size + 3) / 4) * 4;
    const uint x_aligned_to_8 = ((x_size + 7) / 8) * 8;
    const uint b_offset = b * b_pitch;
    const uint xy_offset = x * x_pitch + y * y_pitch;
    const uint f_offset = f;
    const size_t idx = offset + xy_offset + b_offset + f_offset;
    return idx;
}

#define GET_DATA_BYX8_F4_INDEX(prefix, b, f, y, x) \
    FUNC_CALL(get_byx8_f4_index)( \
        b, f, y, x, CAT(prefix, _X_PITCH), \
        CAT(prefix, _Y_PITCH), \
        CAT(prefix, _BATCH_PITCH), \
        CAT(prefix, _FEATURE_NUM), \
        CAT(prefix, _SIZE_X), \
        CAT(prefix, _OFFSET))

#define GET_DATA_BF8_XY16_INDEX(prefix, b, f, y, x) \
    FUNC_CALL(get_bf8_xy16_index)( \
        b, f, y, x, CAT(prefix, _SIZE_X ), \
        CAT(prefix, _SIZE_Y), \
        CAT(prefix, _FEATURE_NUM), \
        CAT(prefix, _OFFSET))

inline uint FUNC(get_fs_bs_yx_bsv4_fsv32_index)(uint b, uint f, uint y, uint x,
    uint x_pad_before, uint x_size, uint x_pad_after,
    uint y_pad_before, uint y_size, uint y_pad_after,
    uint size_f, uint size_b)
{
    const uint f_32_aligned = ((size_f + 31)/32) * 32;
    const uint b_4_aligned = ((size_b + 3)/4) * 4;
    const uint fsv_idx = f % 32;
    const uint bsv_idx = b % 4;
    const uint fs_idx = f / 32;
    const uint bs_idx = b / 4;

    const uint x_pitch = 32 * 4;
    const uint y_pitch = 32 * 4 * (x_pad_before + x_size + x_pad_after);
    const uint bs_pitch = y_pitch * (y_pad_before + y_size + y_pad_after);
    const uint fs_pitch = bs_pitch * (b_4_aligned / 4);
    uint offset = x_pitch * x_pad_before + y_pitch * y_pad_before;

    size_t idx = offset + fsv_idx + bsv_idx * 32;
    idx += 32*4 * x;
    idx += y * y_pitch;
    idx += bs_idx * bs_pitch;
    idx += fs_idx * fs_pitch;

    return idx;
}

#define GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(prefix, b, f, y, x) \
    FUNC_CALL(get_fs_bs_yx_bsv4_fsv32_index)( \
        b, f, y, x, \
        CAT(prefix, _PAD_BEFORE_SIZE_X), \
        CAT(prefix, _SIZE_X), \
        CAT(prefix, _PAD_AFTER_SIZE_X), \
        CAT(prefix, _PAD_BEFORE_SIZE_Y), \
        CAT(prefix, _SIZE_Y), \
        CAT(prefix, _PAD_AFTER_SIZE_Y), \
        CAT(prefix, _FEATURE_NUM), \
        CAT(prefix, _BATCH_NUM))

#define GET_FILTER_GOIYX(prefix, g, o, i, y, x) \
    CAT(prefix, _OFFSET) + \
    (x)*CAT(prefix, _X_PITCH) + \
@ -172,7 +172,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
        }
#endif

#if OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
#if OUTPUT_LAYOUT_B_FS_YX_FSV4
        const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
#if OUTPUT_FEATURE_NUM % 4 == 0
        *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;
@ -1,189 +0,0 @@
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "include/include_all.cl"

#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)

#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)

#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)

#if MAX_POOLING
    #define INIT_VAL ACCUMULATOR_VAL_MIN
#elif AVG_POOLING
    #define INIT_VAL ACCUMULATOR_VAL_ZERO
#else
#error
#endif

inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
{
#if MAX_POOLING
    return ACCUMULATOR_MAX_FUNC(tmp, in);
#elif AVG_POOLING
    return tmp + in;
#endif
}

KERNEL(pooling_gpu_byxf_af32)(
    const __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
    , FUSED_OPS_DECLS
#endif
)
{
    const uint x = (uint)get_global_id(0);
    const uint y = (uint)get_global_id(1);
    const uint bf = (uint)get_global_id(2);
    // we process 4 features per work item, hence the divisions below
    const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
    const uint f = 4 * (bf % (aligned32_features / 4));
    const uint b = bf / (aligned32_features / 4);

    typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_t;
    if (x >= OUTPUT_SIZE_X)
    {
        return;
    }

    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;

    ACCUMULATOR_VEC4 result = INIT_VAL;

#ifdef CHECK_BOUNDRY
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
    {
        return;
    }

#ifdef DYNAMIC_KERNEL_DIVIDER
    uint num_elements = 0;
#endif

    const uint batch_and_feature_offset = GET_DATA_INDEX(INPUT0, b, f, 0, 0);
    for(uint j = 0; j < POOL_SIZE_Y; j++)
    {
        int input_offset_y = offset_y + j;
        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
        if(!zero_y)
        {
            for(uint i = 0; i < POOL_SIZE_X; i++)
            {
                int input_offset_x = offset_x + i;
                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
                if(!zero)
                {
                    const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;

                    input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
                    result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
                    result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
                    result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
                    result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));

#ifdef DYNAMIC_KERNEL_DIVIDER
                    num_elements++;
#endif
                }
            }
        }
    }
#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
    const uint num_elements = (hend - offset_y) * (wend - offset_x);
#endif
#else
    uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x);

    for(uint j = 0; j < POOL_SIZE_Y; j++)
    {
        for(uint i = 0; i < POOL_SIZE_X; i++)
        {
            input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
            result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
            result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
            result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
            result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));

            input_idx += INPUT0_X_PITCH;
        }
        input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH);
    }

#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
    const uint num_elements = POOL_SIZE_X*POOL_SIZE_Y;
#endif
#endif

#if defined AVG_POOLING
    #if ENABLE_ROUND
        int4 not_fused_result;
        for (uint i = 0; i < 4; ++i) {
    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
            not_fused_result[i] = convert_int(round((float)result[i] / max(num_elements, (uint)1)));
    #else
            not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
    #endif
        }
    #else  // ENABLE_ROUND
        float4 not_fused_result;
        for (uint i = 0; i < 4; ++i) {
    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
            not_fused_result[i] = (float)result[i] / max(num_elements, (uint)1);
    #else
            not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
    #endif
        }
    #endif  // ENABLE_ROUND
#else  // AVG_POOLING
    float4 not_fused_result = convert_float4(result);
#endif  // AVG_POOLING

    OUTPUT_VEC4 final_result;
#if HAS_FUSED_OPS
    ACTIVATION_VEC4 fused_pool_result = TO_ACTIVATION_VEC4(not_fused_result);
    FUSED_OPS;
    final_result = FUSED_OPS_RESULT;
    for(uint op = 0; op < 4; op++)
    {
        const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
        output[output_pos] = final_result[op];
    }
#else
    final_result = TO_OUTPUT_VEC4(not_fused_result);
    for(uint op = 0; op < 4; op++)
    {
        const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
        final_result[op] = TO_OUTPUT_TYPE(ACTIVATION(not_fused_result[op], ACTIVATION_PARAMS));
        output[output_pos] = final_result[op];
    }
#endif
}

#undef INIT_VAL
#undef ACCUMULATOR_VEC4

#undef ACTIVATION_VEC4
#undef TO_ACTIVATION_VEC4

#undef OUTPUT_VEC4
#undef TO_OUTPUT_VEC4
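Each work item covers 4 consecutive features because one 32-bit block read per lane carries exactly four packed 8-bit values in this layout; schematically, for the int8 case:

// uint word  = intel_sub_group_block_read(ptr);  // one 32-bit lane read
// char4 vals = as_char4(word);                   // features f .. f+3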
@ -40,7 +40,7 @@ KERNEL(pooling_gpu_int8_ref)(
#endif
)
{
#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BYXF_AF32 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BFZYX
    const uint x = (uint)get_global_id(0);
    const uint yz = (uint)get_global_id(1);
#if OUTPUT_DIMS == 5

@ -44,7 +44,7 @@ KERNEL(pooling_gpu)(
)
{
#if OUTPUT_LAYOUT_BFYX || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\
    OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
    OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || OUTPUT_LAYOUT_B_FS_YX_FSV4
    const uint x = (uint)get_global_id(0);
#if OUTPUT_DIMS == 5
    const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;

@ -27,16 +27,8 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
      defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
    return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_BF8_XY16
    return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
    return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYXF_AF32
    return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYX8_F4
    return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
    return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
    return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_FS_B_YX_FSV32
@ -1,130 +0,0 @@
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "include/reshape_dims.cl"
#include "include/fetch.cl"

#include "include/data_types.cl"

///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x)
{
#if INPUT0_SIMPLE
    return GET_DATA_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
      defined INPUT0_LAYOUT_BS_F_BSV16__AF8
    return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
#elif defined INPUT0_LAYOUT_BF8_XY16
    return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BYXF_AF32
    return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BYX8_F4
    return GET_DATA_BYX8_F4_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_FS_BS_YX_BSV4_FSV32
    return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV4
    return GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, y, x);
#else
#error reorder_data.cl: input format - not supported
#endif
}

///////////////////////// Output Index /////////////////////////

inline uint FUNC(get_output_index)(uint b, uint f, uint y, uint x)
{
#if OUTPUT_SIMPLE
    return GET_DATA_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
      defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
    return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_BF8_XY16
    return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYXF_AF32
    return GET_DATA_BYXF_AF32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_BYX8_F4
    return GET_DATA_BYX8_F4_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32
    return GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV4
    return GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, b, f, y, x);
#else
#error reorder_data.cl: output format - not supported
#endif
}

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL (reorder_data_byxf_f32_to_byx8_f4_i8)(
    const __global INPUT_REORDER_TYPE* input,
    __global OUTPUT_REORDER_TYPE* output
#ifdef MEAN_SUBTRACT_IN_BUFFER
    , __global MEAN_SUBTRACT_TYPE* mean_subtract
#endif
    )
{
    const uint x = get_global_id(0);
    const uint y = get_group_id(1);
    const uint b = (uint)get_group_id(2) * WG_BATCH_SIZE + (uint)get_sub_group_id();

    const uint input_idx = FUNC_CALL(get_input_index)(b, 0, y, x);
    const uint output_idx = FUNC_CALL(get_output_index)(b, 0, y, x);

#if defined MEAN_SUBTRACT_INSIDE_PARAMS
    float4 res;
    res.s0 = TO_MEAN_TYPE(input[input_idx]);
    res.s0 = MEAN_OP(res.s0, VALUE_TO_SUBTRACT[0 % VALUE_TO_SUBTRACT_SIZE]);
    res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
    res.s1 = MEAN_OP(res.s1, VALUE_TO_SUBTRACT[1 % VALUE_TO_SUBTRACT_SIZE]);
    res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
    res.s2 = MEAN_OP(res.s2, VALUE_TO_SUBTRACT[2 % VALUE_TO_SUBTRACT_SIZE]);
    res.s3 = 0;
#elif defined MEAN_SUBTRACT_IN_BUFFER
#if defined MEAN_PER_FEATURE
    MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
    res.s0 = TO_MEAN_TYPE(input[input_idx]);
    res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
    res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
    res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
    res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
    res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
    res.s3 = 0;
#else
    MAKE_VECTOR_TYPE(MEAN_SUBTRACT_TYPE, 4) res;
    res.s0 = TO_MEAN_TYPE(input[input_idx]);
    res.s1 = TO_MEAN_TYPE(input[input_idx+1]);
    res.s2 = TO_MEAN_TYPE(input[input_idx+2]);
    res.s3 = 0;

    res.s0 = MEAN_OP(res.s0, mean_subtract[0]);
    res.s1 = MEAN_OP(res.s1, mean_subtract[1]);
    res.s2 = MEAN_OP(res.s2, mean_subtract[2]);
#endif
#else
    MAKE_VECTOR_TYPE(CALC_TYPE, 4) res;
    res.s0 = TO_CALC_TYPE(input[input_idx]);
    res.s1 = TO_CALC_TYPE(input[input_idx+1]);
    res.s2 = TO_CALC_TYPE(input[input_idx+2]);
    res.s3 = 0;
#endif

    char4 out_vals;
    out_vals.s0 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s0), ACTIVATION_PARAMS_TYPED);
    out_vals.s1 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s1), ACTIVATION_PARAMS_TYPED);
    out_vals.s2 = ACTIVATION_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE_SAT(res.s2), ACTIVATION_PARAMS_TYPED);
    out_vals.s3 = 0;

    __global uint* dst = (__global uint*)output;
    dst[output_idx/4] = as_uint(out_vals);
}
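The final store writes all four int8 lanes in one transaction by reinterpreting the char4 as a uint; the equivalent scalar stores it replaces would be:

// unpacked form of `dst[output_idx/4] = as_uint(out_vals);`
// (byx8_f4 keeps the 4 feature bytes of one pixel contiguous):
//   output[output_idx + 0] = out_vals.s0;
//   output[output_idx + 1] = out_vals.s1;
//   output[output_idx + 2] = out_vals.s2;
//   output[output_idx + 3] = out_vals.s3;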
@ -30,8 +30,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
      defined INPUT0_LAYOUT_BS_F_BSV16__AF8
    return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
#elif defined INPUT0_LAYOUT_BF8_XY16
    return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
    return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_ZYX_FSV16
@ -54,8 +52,6 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
#elif defined OUTPUT_LAYOUT_BS_F_BSV8__AF8 || \
      defined OUTPUT_LAYOUT_BS_F_BSV16__AF8
    return GET_DATA_BS_FYX_BSV8_INDEX(OUTPUT, b, f, y, x, SUB_GROUP_SIZE);
#elif defined OUTPUT_LAYOUT_BF8_XY16
    return GET_DATA_BF8_XY16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_YX_FSV16
    return GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, b, f, y, x);
#elif defined OUTPUT_LAYOUT_B_FS_ZYX_FSV16

@ -26,10 +26,6 @@ inline uint FUNC(get_input_index)(uint b, uint f, uint y, uint x)
#elif defined INPUT0_LAYOUT_BS_F_BSV8__AF8 || \
      defined INPUT0_LAYOUT_BS_F_BSV16__AF8
    return GET_DATA_BS_FYX_BSV8_INDEX(INPUT0, b, f, y, x, SUB_GROUP_SIZE);
#elif defined INPUT0_LAYOUT_BF8_XY16
    return GET_DATA_BF8_XY16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_BYXF_AF32
    return GET_DATA_BYXF_AF32_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_B_FS_YX_FSV16
    return GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, b, f, y, x);
#elif defined INPUT0_LAYOUT_FS_B_YX_FSV32

@ -330,8 +330,6 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
        raw_index_func_val = "GET_DATA_INDEX_RAW(" + _name + ", b, f, y, x)";
    } else if (layout == DataLayout::b_fs_yx_fsv16 ||
               layout == DataLayout::b_fs_yx_fsv32 ||
               layout == DataLayout::byxf_af32 ||
               layout == DataLayout::fs_bs_yx_bsv4_fsv32 ||
               layout == DataLayout::b_fs_yx_fsv4 ||
               layout == DataLayout::fs_b_yx_fsv32 ||
               layout == DataLayout::bs_fs_yx_bsv16_fsv16) {
@ -103,11 +103,7 @@ std::string toString(DataLayout l) {
        case kernel_selector::DataLayout::b_fs_zyx_fsv32: return "B_FS_ZYX_FSV32";
        case kernel_selector::DataLayout::bs_f_bsv8__af8: return "BS_F_BSV8__AF8";
        case kernel_selector::DataLayout::bs_f_bsv16__af8: return "BS_F_BSV16__AF8";
        case kernel_selector::DataLayout::bf8_xy16: return "BF8_XY16";
        case kernel_selector::DataLayout::winograd_2x3_s1_data: return "WINOGRAD_2x3_S1_DATA";
        case kernel_selector::DataLayout::byxf_af32: return "BYXF_AF32";
        case kernel_selector::DataLayout::byx8_f4: return "BYX8_F4";
        case kernel_selector::DataLayout::fs_bs_yx_bsv4_fsv32: return "FS_BS_YX_BSV4_FSV32";
        case kernel_selector::DataLayout::b_fs_yx_fsv4: return "B_FS_YX_FSV4";
        case kernel_selector::DataLayout::b_fs_yx_32fp: return "B_FS_YX_32FP";
        case kernel_selector::DataLayout::bfzyx: return "BFZYX";
@ -272,40 +272,6 @@ layout convolution_inst::calc_output_layout(convolution_node const& node) {
        return {output_type, format::b_fs_yx_32fp, output_size};
    }

    // for performance reasons, when fs_bs_yx_bsv4_fsv32 is used the first convolution has 3 input features,
    // so the first conv layer takes byxf input and returns fs_bs_yx_bsv4_fsv32
    if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
        input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
        return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
    }

    auto users = node.get_users();
    if (users.size() == 1 && users.front()->is_type<convolution>()) {
        auto conv_split = users.front()->as<convolution>().get_split();
        auto conv_groups = (int32_t)users.front()->as<convolution>().get_groups();

        bool next_is_dw = ((conv_split > 1 && conv_split == output_size.feature[0]) ||
                           (conv_groups > 1 && conv_groups == output_size.feature[0]));

        if (input_layout.data_type == data_types::i8 && input_layout.format == format::b_fs_yx_fsv4 && next_is_dw) {
            return layout{output_type, cldnn::format::byxf_af32, output_size};
        }

        auto prev_node = node.get_dependencies().front();
        if (prev_node->is_type<reorder>())
            prev_node = prev_node->get_dependencies().front();

        auto prev_is_convo = prev_node->is_type<convolution>();
        if (prev_is_convo) {
            auto prev2_node = prev_node->get_dependencies().front();
            auto prev_input_format = prev2_node->get_output_layout().format;

            if (input_layout.data_type == data_types::i8 && input_layout.format == format::byxf_af32 && !next_is_dw &&
                prev_input_format == format::b_fs_yx_fsv4) {
                return layout{output_type, cldnn::format::b_fs_yx_fsv4, output_size};
            }
        }
    }
    return {output_type, input_layout.format, output_size};
}

@ -230,13 +230,6 @@ layout fused_conv_eltwise_inst::calc_output_layout(fused_conv_eltwise_node const
                                  output_range.spatial[1],
                                  output_range.spatial[2]);

    // for performance reasons, when fs_bs_yx_bsv4_fsv32 is used the first convolution has 3 input features,
    // so the first conv layer takes byxf input and returns fs_bs_yx_bsv4_fsv32
    if (input_layout.data_type == data_types::i8 && input_layout.format == format::byx8_f4 &&
        input_layout.size.batch[0] % 4 == 0 && input_layout.size.feature[0] == 3) {
        return layout{output_type, cldnn::format::fs_bs_yx_bsv4_fsv32, output_size};
    }

    return {output_type, input_layout.format, output_size};
}

@ -150,8 +150,6 @@ attach_concatenation_gpu::attach_concatenation_gpu() {
        {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), concatenation_gpu::create},
        {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), concatenation_gpu::create},
        // MMAD
        {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create},
        {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), concatenation_gpu::create},
        {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), concatenation_gpu::create},
        {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), concatenation_gpu::create},
        {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), concatenation_gpu::create},
@ -189,8 +189,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfzyx), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::winograd_2x3_s1_data), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::winograd_2x3_s1_data), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bf8_xy16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bf8_xy16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
    // block f16 format
@ -202,11 +200,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv16), val_fw);
    // MMAD
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);

@ -216,7 +209,6 @@ attach_convolution_gpu::attach_convolution_gpu() {
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), val_fw);

    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
@ -1,192 +0,0 @@
|
||||
/*
|
||||
// Copyright (c) 2016 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef CODE_PREFIX
|
||||
#define CODE_BEGIN CODE_PREFIX
|
||||
#define CODE_END CODE_POSTFIX
|
||||
#else
|
||||
#define CODE_BEGIN
|
||||
#define CODE_END
|
||||
#endif
|
||||
|
||||
CODE_BEGIN
|
||||
enum neural_memory_format {
|
||||
x_f32,
|
||||
xb_f32, // 1D+batch, float32
|
||||
bx_f32, // 1D+batch, float32
|
||||
yxfb_f32, // 3D+batch, float32
|
||||
byxf_f32, // for convolution_cpu_jit_batch1
|
||||
bfyx_f32, // used in Caffe
|
||||
fyxb_f32, // used in Caffe
|
||||
oiyx_f32, // format used only for weights: o - output feature maps, i - input feature maps
|
||||
byxf_b24_f32, // for convolution_cpu_generic
|
||||
yxoi_o4_f32, // for convolution_cpu_generic
|
||||
os_yxi_sv16_f32, // format used only for weights: os - output slice, i - input feature maps, sv16 - 16 values of single slice
|
||||
bs_yxf_bv24_f32,
|
||||
any=-1
|
||||
};
|
||||
|
||||
#pragma pack(push, 4)
|
||||
typedef struct _neural_memory_tag {
|
||||
uint format;
|
||||
uint feature_offset;
|
||||
uint spatial_offset;
|
||||
uint vector_size;
|
||||
uint data_offset;
|
||||
uint data[1];
|
||||
} neural_memory;
|
||||
|
||||
typedef struct _neural_vector_tag {
|
||||
uint feature_offset;
|
||||
uint spatial_offset;
|
||||
uint raw_size;
|
||||
uint data[1];
|
||||
} neural_vector;
|
||||
#pragma pack(pop)
|
||||
|
||||
// neural_memory accessors
|
||||
__attribute__((overloadable)) __global uint* get_raw(__global neural_memory* mem) { return &(mem->data[0]); }
|
||||
__attribute__((overloadable)) const __global uint* get_raw(const __global neural_memory* mem) { return &(mem->data[0]); }
|
||||
__attribute__((overloadable)) uint get_raw_size(const __global neural_memory* mem) { return mem->vector_size; }
|
||||
|
||||
__attribute__((overloadable)) __global uint* get_batch(__global neural_memory* mem) { return get_raw(mem); }
|
||||
__attribute__((overloadable)) const __global uint* get_batch(const __global neural_memory* mem) { return get_raw(mem); }
|
||||
__attribute__((overloadable)) uint get_batch_size(const __global neural_memory* mem) { return mem->feature_offset; }
|
||||
|
||||
__attribute__((overloadable)) __global uint* get_feature(__global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
|
||||
__attribute__((overloadable)) const __global uint* get_feature(const __global neural_memory* mem) { return &(mem->data[mem->feature_offset]); }
|
||||
__attribute__((overloadable)) uint get_feature_size(const __global neural_memory* mem) { return mem->spatial_offset - mem->feature_offset; }
|
||||
|
||||
__attribute__((overloadable)) __global uint* get_spatial(__global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
|
||||
__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_memory* mem) { return &(mem->data[mem->spatial_offset]); }
|
||||
__attribute__((overloadable)) uint get_spatial_size(const __global neural_memory* mem) { return get_raw_size(mem) - mem->spatial_offset; }
|
||||
|
||||
__attribute__((overloadable)) __global void* get_data(__global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
|
||||
__attribute__((overloadable)) const __global void* get_data(const __global neural_memory* mem) { return &(mem->data[mem->data_offset]); }
|
||||
__attribute__((overloadable)) size_t get_element_size(const __global neural_memory* mem) { return sizeof(float); }
|
||||
|
||||
__attribute__((overloadable)) size_t get_data_size(const __global neural_memory* mem) {
|
||||
size_t result = get_element_size(mem);
|
||||
|
||||
const __global uint* raw = get_raw(mem);
|
||||
uint raw_size = get_raw_size(mem);
|
||||
|
||||
for(uint i = 0; i < raw_size; i++) {
|
||||
result *= raw[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// neural_vector accessors
|
||||
// TODO NOTE: non-const accessors are disabled now, because read-only neural_vector argument is only supported now
|
||||
|
||||
//__attribute__((overloadable)) __global uint* get_raw(__global neural_vector* v) { return &(v->data[0]); }
|
||||
__attribute__((overloadable)) const __global uint* get_raw(const __global neural_vector* v) { return &(v->data[0]); }
|
||||
__attribute__((overloadable)) uint get_raw_size(const __global neural_vector* v) { return v->raw_size; }
|
||||
|
||||
//__attribute__((overloadable)) __global uint* get_batch(__global neural_vector* v) { return get_raw(v); }
|
||||
__attribute__((overloadable)) const __global uint* get_batch(const __global neural_vector* v) { return get_raw(v); }
|
||||
__attribute__((overloadable)) uint get_batch_size(const __global neural_vector* v) { return v->feature_offset; }
|
||||
|
||||
//__attribute__((overloadable)) __global uint* get_feature(__global neural_vector* v) { return &(v->data[v->feature_offset]); }
|
||||
__attribute__((overloadable)) const __global uint* get_feature(const __global neural_vector* v) { return &(v->data[v->feature_offset]); }
|
||||
__attribute__((overloadable)) uint get_feature_size(const __global neural_vector* v) { return v->spatial_offset - v->feature_offset; }
|
||||
|
||||
//__attribute__((overloadable)) __global uint* get_spatial(__global neural_vector* v) { return &(v->data[v->spatial_offset]); }
|
||||
__attribute__((overloadable)) const __global uint* get_spatial(const __global neural_vector* v) { return &(v->data[v->spatial_offset]); }
|
||||
__attribute__((overloadable)) uint get_spatial_size(const __global neural_vector* v) { return get_raw_size(v) - v->spatial_offset; }
|
||||
|
||||
CODE_END

/*
KERNEL(Fully_Connected_GPU)
DECALRE_CONSTANT()
BEGIN_ARGUMENTS_DECLARATION
DECLARE_INPUT_MEMORY_ARGUMENT(input_mem)
DECLARE_INPUT_MEMORY_ARGUMENT(weights_mem)
DECLARE_INPUT_MEMORY_ARGUMENT(bias_mem)
DECLARE_OUTPUT_MEMORY_ARGUMENT(dst_mem)
END_ARGUMENTS_DECLARATION
CODE_BEGIN
#define WEIGHTS { 1.0, 3.2, 4.5, 6.7 }
#define WEIGHTS_SIZE { 2, 2 }
#define WEIGHTS_DIM 2
*/
__kernel void Fully_Connected_GPU(__global neural_memory* input_mem, __global neural_memory* weights_mem, __global neural_memory* bias_mem, __global neural_memory* dst_mem)
{
__global uint* input_size = get_raw(input_mem);
__global uint* weights_size = get_raw(weights_mem);
__global float* input = (__global float*)get_data(input_mem);
__global float* weights = (__global float*)get_data(weights_mem);
__global float* bias = (__global float*)get_data(bias_mem);
__global float* pDst = (__global float*)get_data(dst_mem);

const int x = get_global_id(0);

pDst[x] = 0;
uint outXIdx = x / input_size[0];
uint inputBatchIdx = x % input_size[0];
uint weightYIdx = outXIdx * weights_size[0];
for (uint i = 0; i < input_size[2]; i++)
{
pDst[x] += input[i * input_size[0] + inputBatchIdx] * weights[weightYIdx + i];
}
pDst[x] += bias[outXIdx];
}
CODE_END
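This removed kernel maps each work-item x to one (output neuron, batch) pair: outXIdx selects the weight row and inputBatchIdx the batch column, with batch as the innermost dimension. A plain C++ sketch of the same arithmetic, as a reading aid only, assuming weights_size[0] equals the input feature count input_size[2] (names here are illustrative, not from the library):

// Reference loop equivalent to one work-item per (out, b) pair.
void fully_connected_ref(const float* input,    // [num_inputs][batch], batch innermost
                         const float* weights,  // [num_outputs][num_inputs]
                         const float* bias,     // [num_outputs]
                         float* dst,            // [num_outputs][batch], batch innermost
                         int batch, int num_inputs, int num_outputs) {
    for (int out = 0; out < num_outputs; ++out) {
        for (int b = 0; b < batch; ++b) {
            float acc = 0.f;
            for (int i = 0; i < num_inputs; ++i)
                acc += input[i * batch + b] * weights[out * num_inputs + i];
            dst[out * batch + b] = acc + bias[out];  // x == out * batch + b in the kernel
        }
    }
}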

CODE_BEGIN
__kernel void Convolution_GPU(
const __global neural_memory* input_mem,
const __global neural_memory* filter_mem,
float bias,
__global neural_memory* dst_mem,
const __global neural_vector* spatial_stride)
{

//
const __global uint* input_size = get_raw(input_mem);
const __global uint* filter_size = get_raw(filter_mem);
const __global uint* dst_size = get_raw(dst_mem);
const __global float* input = (const __global float*)get_data(input_mem);
const __global float* filter = (const __global float*)get_data(filter_mem);
__global float* pDst = (__global float*)get_data(dst_mem);
//

int global_id = get_global_id(0);
const int batch_num = dst_size[0];
const int batch_offset = global_id % dst_size[0];

const int idx = global_id / batch_num;

const int x = (idx % input_size[2]) * get_spatial(spatial_stride)[0];
const int y = (idx * get_spatial(spatial_stride)[1]) / input_size[2];

const int out_offset = idx * batch_num + batch_offset;

pDst[out_offset] = 0;
for (uint i = 0; i < filter_size[4]; i++)
{
for (uint j = 0; j < filter_size[3]; j++)
{
int input_idx = (x + j + ((y + i) * input_size[2])) * batch_num + batch_offset;
int filter_idx = i * filter_size[3] + j;
pDst[out_offset] += input[input_idx] * filter[filter_idx];
}
}
pDst[out_offset] += bias;
}
CODE_END
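Similarly, the removed Convolution_GPU kernel performs a single-feature 2D correlation with the filter anchored at the top-left, one work-item per (output position, batch) pair; filter_size[3] and filter_size[4] are the filter width and height, and input_size[2] the row width. For the common stride-1 case it reduces to the following sketch (a reading of the kernel under that assumption, with illustrative names; batch is again innermost):

// Stride-1 reading of the removed kernel: out(oy, ox, b) = bias + filter-window sum.
void convolution_ref(const float* input,   // [in_h][in_w][batch], batch innermost
                     const float* filter,  // [flt_h][flt_w]
                     float bias,
                     float* dst,           // [out_h][out_w][batch]
                     int batch, int in_w, int flt_w, int flt_h,
                     int out_h, int out_w) {
    for (int oy = 0; oy < out_h; ++oy)
        for (int ox = 0; ox < out_w; ++ox)
            for (int b = 0; b < batch; ++b) {
                float acc = 0.f;
                for (int i = 0; i < flt_h; ++i)      // filter_size[4] in the kernel
                    for (int j = 0; j < flt_w; ++j)  // filter_size[3] in the kernel
                        acc += input[((oy + i) * in_w + (ox + j)) * batch + b]
                             * filter[i * flt_w + j];
                dst[(oy * out_w + ox) * batch + b] = acc + bias;
            }
}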

@ -182,9 +182,6 @@ attach_eltwise_gpu::attach_eltwise_gpu() {
{ std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), eltwise_gpu::create },
// MMAD
{ std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), eltwise_gpu::create },
{ std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), eltwise_gpu::create },
@ -100,8 +100,6 @@ attach_fully_connected_gpu::attach_fully_connected_gpu() {
{std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw},
{std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw},
// MMAD
{std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), val_fw},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), val_fw},
// IMAD
@ -183,23 +183,13 @@ attach_fused_conv_eltwise_gpu::attach_fused_conv_eltwise_gpu() {
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
fused_conv_eltwise_gpu::create);
// MMAD
implementation_map<fused_conv_eltwise>::add(
std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32),
fused_conv_eltwise_gpu::create);
// IMAD
implementation_map<fused_conv_eltwise>::add(
std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(
std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32),
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32),
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4),
fused_conv_eltwise_gpu::create);
implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::image_2d_rgba),
fused_conv_eltwise_gpu::create);
fused_conv_eltwise_gpu::create);
}

} // namespace detail
@ -206,9 +206,6 @@ attach_pooling_gpu::attach_pooling_gpu() {
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);

implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create);

implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), pooling_gpu::create);
@ -113,11 +113,6 @@ attach_quantize_gpu::attach_quantize_gpu() {
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);

implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);

implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw);
@ -99,11 +99,7 @@ attach_resample_gpu::attach_resample_gpu() {
{std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), resample_gpu::create},
{std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), resample_gpu::create}});
{std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), resample_gpu::create}});
}

} // namespace detail
@ -127,10 +127,6 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
(l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
return false;

// TODO: If we replace byxf_af32 with byxf we can probably do this optimization, but support in kernels is required
if (l.format == format::byxf_af32 && (l.size.feature[0] % 32 != 0 || node.get_primitive()->axis != concatenation::along_f))
return false;

if (l.format == format::bs_fs_yx_bsv16_fsv16)
return false;

@ -121,12 +121,9 @@ void prepare_padding::run(program_impl& p) {

// right now output padding optimization is only available for bfyx format and data type = float32
if (conv_layout.format != cldnn::format::bfyx &&
conv_layout.format != cldnn::format::bf8_xy16 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
conv_layout.format != cldnn::format::b_fs_zyx_fsv16 &&
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
conv_layout.format != cldnn::format::byxf_af32 &&
conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv4 &&
conv_layout.format != cldnn::format::fs_b_yx_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_32fp) {
@ -760,11 +760,8 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
for (auto& dep : eltw_node->get_dependencies()) {
format fmt = dep->get_output_layout().format;
data_types dep_dt = dep->get_output_layout().data_type;
if ((fmt != format::fs_bs_yx_bsv4_fsv32 || dep_dt != data_types::i8) &&
(fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
if ((fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::i8) &&
(fmt != format::b_fs_yx_fsv4 || dep_dt != data_types::u8) &&
(fmt != format::byxf_af32 || dep_dt != data_types::i8) &&
(fmt != format::byxf_af32 || dep_dt != data_types::u8) &&
(fmt != format::bfyx || dep_dt != data_types::f32) && (fmt != format::bfyx || dep_dt != data_types::u8) &&
(fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16) &&
(fmt != format::bfyx || dep_dt != data_types::f16 || !if_already_depth_to_space_fused))
@ -372,70 +372,6 @@ void reorder_inputs::run(program_impl& p, layout_optimizer& lo, reorder_factory&
auto fmt_map = get_preferred_formats(p, lo);
propagate_formats(p, fmt_map, lo);
minimize_local_reorders(p, fmt_map, lo);

// WA START ============================================================================================================
if (lo.get_optimization_attributes().b_fs_yx_fsv16_network) {
// This is a temporary work-around for a known bad case until byxf_af32 handling is corrected in layout_optimizer.
//
// Find pattern:
// mvn(int8, b_fs_yx_fsv16, [x,16,1280,720]) -> conv(int8, byxf_af32, [x,3,1280,720]) -> mvn(*, bfyx) ->
// Replace with:
// mvn(b_fs_yx_fsv16) -> conv(b_fs_yx_fsv16) -> mvn(b_fs_yx_fsv16) ->
//
// Generally, for such a convolution b_fs_yx_fsv16 will always perform better than byxf_af32,
// but to avoid unvalidated int8 b_fs_yx_fsv16 networks and potential regressions this WA is needed.
// Additionally, a reorder from af32 -> bfyx takes ~9 times longer than the actual convolution.
for (auto& node_ptr : p.get_processing_order()) {
if (!node_ptr->is_in_data_flow() || !node_ptr->is_type<convolution>() || fmt_map.at(node_ptr) != format::byxf_af32)
continue;

auto& conv_node = node_ptr->as<convolution>();

bool input_path =
conv_node.input().get_output_layout().data_type == data_types::i8 &&
conv_node.input().is_type<mvn>() &&
fmt_map.at(&conv_node.input()) == format::b_fs_yx_fsv16;
bool output_path =
conv_node.get_users().size() == 1 &&
conv_node.get_users().front()->is_type<mvn>() &&
fmt_map.at(conv_node.get_users().front()) == format::bfyx &&
conv_node.get_users().front()->get_users().size() == 1 &&
!conv_node.get_users().front()->as<mvn>().get_primitive()->across_channels;

if (!input_path || !output_path)
continue;

auto in_lay = conv_node.input().get_output_layout();
auto out_lay = conv_node.get_output_layout();
auto wei_lay = conv_node.weights().get_output_layout();
bool correct_layouts =
// weights
wei_lay.data_type == data_types::i8 &&
wei_lay.size.spatial[0] == 3 && wei_lay.size.spatial[1] == 3 &&
// input/output
in_lay.data_type == data_types::i8 && out_lay.data_type == data_types::i8 &&
in_lay.size.feature[0] == 16 && out_lay.size.feature[0] == 3 &&
in_lay.size.spatial[0] == 1280 && out_lay.size.spatial[0] == 1280 &&
in_lay.size.spatial[1] == 720 && out_lay.size.spatial[1] == 720;

if (!correct_layouts)
continue;

bool correct_conv =
conv_node.get_groups() == 1 && conv_node.get_split() == 1 && conv_node.get_deformable_groups() == 1 &&
!conv_node.get_depthwise_sep_opt() && !conv_node.get_transposed() &&
!conv_node.activations_zero_points_term() && !conv_node.weights_zero_points_term() && !conv_node.compensation_term() &&
conv_node.get_primitive()->dilation == tensor(1);

if (!correct_conv)
continue;

fmt_map.at(node_ptr) = format::b_fs_yx_fsv16;
fmt_map.at(conv_node.get_users().front()) = format::b_fs_yx_fsv16;
}
}
// WA END ==============================================================================================================

insert_reorders(p, fmt_map, rf);

for (auto n : p.get_processing_order()) {
@ -85,16 +85,8 @@ inline std::string fmt_to_str(format fmt) {
return "bs_xs_xsv8_bsv16";
case format::bs_x_bsv16:
return "bs_x_bsv16";
case format::bf8_xy16:
return "bf8_xy16";
case format::winograd_2x3_s1_data:
return "winograd_2x3_s1_data";
case format::byxf_af32:
return "byxf_af32";
case format::byx8_f4:
return "byx8_f4";
case format::fs_bs_yx_bsv4_fsv32:
return "fs_bs_yx_bsv4_fsv32";
case format::b_fs_yx_fsv4:
return "b_fs_yx_fsv4";
case format::b_fs_yx_32fp:
@ -126,16 +126,8 @@ kernel_selector::data_layout to_data_layout(format f) {
return kernel_selector::data_layout::bs_f_bsv8__af8;
case format::bs_xs_xsv8_bsv16:
return kernel_selector::data_layout::bs_f_bsv16__af8;
case format::bf8_xy16:
return kernel_selector::data_layout::bf8_xy16;
case format::winograd_2x3_s1_data:
return kernel_selector::data_layout::winograd_2x3_s1_data;
case format::byxf_af32:
return kernel_selector::data_layout::byxf_af32;
case format::byx8_f4:
return kernel_selector::data_layout::byx8_f4;
case format::fs_bs_yx_bsv4_fsv32:
return kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32;
case format::b_fs_yx_fsv4:
return kernel_selector::data_layout::b_fs_yx_fsv4;
case format::b_fs_yx_32fp:
@ -185,16 +177,8 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
return cldnn::format::bs_xs_xsv8_bsv8;
case kernel_selector::data_layout::bs_f_bsv16__af8:
return cldnn::format::bs_x_bsv16;
case kernel_selector::data_layout::bf8_xy16:
return cldnn::format::bf8_xy16;
case kernel_selector::data_layout::winograd_2x3_s1_data:
return cldnn::format::winograd_2x3_s1_data;
case kernel_selector::data_layout::byxf_af32:
return cldnn::format::byxf_af32;
case kernel_selector::data_layout::byx8_f4:
return cldnn::format::byx8_f4;
case kernel_selector::data_layout::fs_bs_yx_bsv4_fsv32:
return cldnn::format::fs_bs_yx_bsv4_fsv32;
case kernel_selector::data_layout::b_fs_yx_32fp:
return cldnn::format::b_fs_yx_32fp;
case kernel_selector::data_layout::bfzyx:
@ -510,20 +494,9 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split
size_t pitch = 1;
auto new_vals = vals;

if (ks_layout == kernel_selector::Tensor::byxf_af32) {
new_vals[3] = align_to(vals[3], 32);
}
if (ks_layout == kernel_selector::Tensor::b_fs_yx_fsv32) {
new_vals[1] = align_to(vals[1], 32);
}
if (ks_layout == kernel_selector::Tensor::fs_bs_yx_bsv4_fsv32) {
new_vals[3] = align_to(vals[3], 32);
new_vals[2] = align_to(vals[2], 4);
}
if (ks_layout == kernel_selector::Tensor::byx8_f4) {
new_vals[3] = align_to(vals[3], 4);
new_vals[2] = align_to(vals[2], 8);
}
if (ks_layout == kernel_selector::Tensor::bs_fs_yx_bsv16_fsv16) {
new_vals[0] = align_to(vals[0], 16);
new_vals[1] = align_to(vals[1], 16);
@ -183,22 +183,13 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
if (next.is_type<fully_connected>() &&
(fmt_prev == format::bfyx || fmt_prev == format::yxfb ||
fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::fs_b_yx_fsv32 ||
fmt_prev == format::byxf_af32 || fmt_prev == format::b_fs_yx_fsv32 ||
fmt_prev == format::b_fs_yx_fsv32 ||
(fmt_prev == format::b_fs_yx_fsv4 &&
prev_output_layout.size.feature[0] % 32 == 0 &&
prev_output_layout.size.spatial[0] == 1 &&
prev_output_layout.size.spatial[1] == 1)))
return true;

if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::b_fs_yx_fsv4 && next.as<convolution>().get_groups() != 1)
return true;

if (next.is_type<convolution>() && fmt_prev == format::byxf_af32 && fmt_next == format::bfyx)
return true;

if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::byxf_af32 && next.as<convolution>().get_groups() == 1)
return true;

if (next.is_type<convolution>() && fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv4 && is_input_idx(0))
return true;

@ -232,7 +223,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,

if (next.is_type<convolution>() &&
(fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bfyx) && prev_output_layout.size.feature[0] == 3 &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32 ||
(fmt_next == format::b_fs_yx_fsv4 ||
fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;

@ -727,23 +718,6 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
// nothing to do, just go out from here.
} else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim) ||
(_output_size_handling_enabled && prim->with_output_size) || node.get_transposed()) {
// commented out due to performance reasons, maybe enable in future
/*if (current_layout.data_type == data_types::f32 &&
current_layout.size.batch[0] % 16 == 0 &&
current_layout.format == format::bfyx &&
output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 &&
prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 &&
prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 &&
!node.get_transposed())
{
if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] *
current_layout.size.spatial[1]) == 16 && current_layout.data_padding == padding{ { 0,0,0,0 }, 0 }))
{
expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1);
expected_format = cldnn::format::bf8_xy16;
}
}
else*/
{
expected_tensor = current_layout.size;
if (current_layout.format == format::b_fs_zyx_fsv16 || current_layout.format == format::bs_fs_zyx_bsv16_fsv16)
@ -254,8 +254,7 @@ memory_impl::ptr memory_pool::get_from_padded_pool(const layout& layout,
((layout.format != format::b_fs_yx_fsv32 && layout.format != format::b_fs_zyx_fsv32) ||
(layout.size.feature[0] % 32 == 0)) &&
// TODO: check if this condition is always correct
((layout.format == format::byxf_af32 && layout.size.feature[0] == rec_list._memory->get_layout().size.feature[0]) ||
(layout.format != format::byxf_af32 && layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0])) &&
layout.size.feature[0] <= rec_list._memory->get_layout().size.feature[0] &&
layout.size.batch[0] <= rec_list._memory->get_layout().size.batch[0] &&
rec_list._memory->get_layout().format != format::fs_b_yx_fsv32 &&
layout.format != format::fs_b_yx_fsv32 &&
@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -142,9 +142,7 @@ std::pair<bool, bool> program_helpers::are_layouts_identical(layout const& l1, l
return {false, false};
if (l1.get_linear_size() != l2.get_linear_size())
return {false, false};
if ((l1.format == format::bf8_xy16 && l2.format != format::bf8_xy16) ||
(l2.format == format::bf8_xy16 && l1.format != format::bf8_xy16) ||
(l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
if ((l1.format == format::b_fs_yx_fsv4 && l2.format != format::b_fs_yx_fsv4) ||
(l2.format == format::b_fs_yx_fsv4 && l1.format != format::b_fs_yx_fsv4) ||
(l1.format == format::fs_b_yx_fsv32 && l2.format != format::fs_b_yx_fsv32) ||
(l2.format == format::fs_b_yx_fsv32 && l1.format != format::fs_b_yx_fsv32) ||
@ -6873,7 +6873,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));
options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));

network network(engine, topology, options);
network.set_input_data("input", input);
@ -6893,7 +6893,7 @@ TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_pa

EXPECT_EQ(output_layout.format, format::bfyx);

EXPECT_EQ(y_size, output_size.spatial[1]);
EXPECT_EQ(y_size, output_size.spatial[1]);
EXPECT_EQ(x_size, output_size.spatial[0]);
EXPECT_EQ(f_size, output_size.feature[0]);
EXPECT_EQ(b_size, output_size.batch[0]);
@ -7945,8 +7945,6 @@ INSTANTIATE_TEST_CASE_P(
.all_test_params(format::bfyx, false, true)
.all_test_params(format::bfyx, true, false)
.all_test_params(format::b_fs_yx_fsv4)
// byxf_af32 - depthwise broken for batch > 1
// .smoke_test_params(format::byxf_af32)
.all_test_params(format::b_fs_yx_fsv32)
.all_test_params(format::b_fs_yx_fsv32, true, true)
.all_test_params(format::b_fs_yx_fsv32, false, true)
@ -1336,7 +1336,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_i8_test::PrintToStringParamName
);
@ -1350,7 +1350,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_u8_test::PrintToStringParamName
);
@ -1364,7 +1364,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_i8_f32_test::PrintToStringParamName
);
@ -1378,7 +1378,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_i8_test::PrintToStringParamName
);
@ -1392,7 +1392,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_u8_test::PrintToStringParamName
);
@ -1406,7 +1406,7 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(1, 3),
testing::Values(1, 3),
testing::Values(3, 32),
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32, format::byxf_af32)
testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::b_fs_yx_fsv32)
),
fully_connected_u8_f32_test::PrintToStringParamName
);
@ -117,7 +117,6 @@ TEST(fused_conv_eltwise, basic_image2d)
EXPECT_EQ(outputs_act.begin()->first, "eltwise");

auto output_act = outputs_act.begin()->second.get_memory();
auto&& out_act_layout = output_act.get_layout();
auto out_act_ptr = output_act.pointer<uint8_t>();

topology topology_ref(
@ -140,7 +139,6 @@ TEST(fused_conv_eltwise, basic_image2d)
EXPECT_EQ(outputs_ref.begin()->first, "out");

auto output_ref = outputs_ref.begin()->second.get_memory();
auto&& out_ref_layout = output_ref.get_layout();
auto out_ref_ptr = output_ref.pointer<uint8_t>();

for (int i = 0; i < 3 * 256 * 4; i++) {
@ -1373,58 +1373,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_shift_swish,
bc_test_params{CASE_CONV3D_S8S8_4, 2, 6},
}), );


class conv_int8_byxf_af32 : public ConvFusingTest {};
TEST_P(conv_int8_byxf_af32, per_channel_coeffs) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
scale("scale", "conv_prim", "scale_data"),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);

implementation_desc conv_impl = { format::byxf_af32, "" };
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));

tolerance = 1e-5f;
execute(p);
}

TEST_P(conv_int8_byxf_af32, per_element_coeffs) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("eltwise_data", get_mem(get_output_layout(p))),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum),
reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32)
);

implementation_desc conv_impl = { format::byxf_af32, "" };
bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));

tolerance = 1e-5f;
execute(p);
}

INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_byxf_af32,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
bc_test_params{CASE_CONV_U8S8_2, 2, 3},
bc_test_params{CASE_CONV_U8S8_3, 2, 3},
bc_test_params{CASE_CONV_U8S8_4, 2, 3},
bc_test_params{CASE_CONV_U8S8_6, 2, 3},
bc_test_params{CASE_CONV_S8S8_1, 2, 3},
bc_test_params{CASE_CONV_S8S8_2, 2, 3},
bc_test_params{CASE_CONV_S8S8_3, 2, 3},
bc_test_params{CASE_CONV_S8S8_4, 2, 3},
bc_test_params{CASE_CONV_S8S8_6, 2, 3},
}), );

class conv_int8_prelu_eltwise : public ConvFusingTest {};
TEST_P(conv_int8_prelu_eltwise, basic) {
auto p = GetParam();
@ -3452,7 +3400,7 @@ struct activation_test_params {
#define CASE_ACTIVATION_F32_2 {7, 3, 7, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_3 {1, 14, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_4 {1, 17, 31, 29}, data_types::f32, format::yxfb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_6 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F32_7 {1, 17, 31, 29}, data_types::f32, format::fyxb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_3D_F32_0 {3, 16, 13, 13, 13}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
@ -3461,14 +3409,13 @@ struct activation_test_params {
#define CASE_ACTIVATION_3D_F32_3 {1, 17, 7, 7, 7}, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_4 {1, 17, 7, 7, 7}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_5 {1, 17, 7, 7, 7}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F32_6 {1, 17, 7, 7, 7}, data_types::f32, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx

#define CASE_ACTIVATION_F16_0 {7, 32, 5, 5}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_2 {7, 16, 7, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_3 {1, 14, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_4 {1, 17, 31, 29}, data_types::f16, format::yxfb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv4, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_6 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
#define CASE_ACTIVATION_F16_7 {1, 17, 31, 29}, data_types::f16, format::fyxb, data_types::f32, format::bfyx
#define CASE_ACTIVATION_3D_F16_0 {3, 16, 13, 13, 13}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
@ -3477,7 +3424,6 @@ struct activation_test_params {
#define CASE_ACTIVATION_3D_F16_3 {1, 17, 7, 7, 7}, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_4 {1, 17, 7, 7, 7}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_5 {1, 17, 7, 7, 7}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
#define CASE_ACTIVATION_3D_F16_6 {1, 17, 7, 7, 7}, data_types::f16, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx

#define CASE_ACTIVATION_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_ACTIVATION_U8_2 {1, 12, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
@ -3572,7 +3518,6 @@ INSTANTIATE_TEST_CASE_P(
activation_test_params{CASE_ACTIVATION_F32_7, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 3, "activation_ref"}, // FIXME - accuracy bug
}), );

class activation_scale_activation_quantize_u8 : public ActivationFusingTest {};
@ -3640,7 +3585,6 @@ INSTANTIATE_TEST_CASE_P(
activation_scale_activation_quantize_u8,
::testing::ValuesIn(std::vector<activation_test_params>{
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 5, "activation_ref"}, // FIXME - accuracy bug
}), );

class activation_scale_activation : public ActivationFusingTest {};
@ -3697,8 +3641,7 @@ INSTANTIATE_TEST_CASE_P(
activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_5, 2, 4, "activation_ref"},
activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},

// InputDataType = UINT8
activation_test_params{CASE_ACTIVATION_U8_1, 2, 4, "activation_ref"},
@ -3719,8 +3662,6 @@ INSTANTIATE_TEST_CASE_P(
::testing::ValuesIn(std::vector<activation_test_params>{
activation_test_params{CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug
activation_test_params{CASE_ACTIVATION_3D_F16_6, 2, 4, "activation_ref"}, // FIXME - accuracy bug
}), );

/* ----------------------------------------------------------------------------------------------------- */
@ -4450,30 +4391,21 @@ struct pooling_test_params {
#define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
#define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_POOLING_U8_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
#define CASE_POOLING_U8_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_U8_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx

#define CASE_POOLING_U8_FP16_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
#define CASE_POOLING_U8_FP16_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f16, format::bfyx
#define CASE_POOLING_U8_FP16_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
#define CASE_POOLING_U8_FP16_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx

#define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
#define CASE_POOLING_I8_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f32, format::bfyx
#define CASE_POOLING_I8_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
#define CASE_POOLING_I8_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx

#define CASE_POOLING_I8_FP16_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f16, format::bfyx
#define CASE_POOLING_I8_FP16_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
#define CASE_POOLING_I8_FP16_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx

// Disabled
#define CASE_POOLING_I8_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfyx
#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx
#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx

class PoolingFusingTest : public ::BaseFusingTest<pooling_test_params> {
public:
void execute(pooling_test_params& p) {
@ -4674,8 +4606,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},

// Input type: INT8
pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4688,8 +4618,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
}), );
@ -4697,9 +4625,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
pooling_scale_activation_quantize,
::testing::ValuesIn(std::vector<pooling_test_params>{
pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt"}, // currently not enabled, fusing not supported
}), );

@ -4742,8 +4667,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},

// Input type: INT8
pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4754,8 +4677,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4822,8 +4743,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},

// Input type: INT8
pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
@ -4834,24 +4753,12 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
}), );

INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
pooling_scale_activation,
::testing::ValuesIn(std::vector<pooling_test_params>{
pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
}), );

/* ----------------------------------------------------------------------------------------------------- */
/* -------------------------------- DepthToSpace cases ------------------------------------------------- */
/* ----------------------------------------------------------------------------------------------------- */

@ -2619,7 +2619,6 @@ INSTANTIATE_TEST_CASE_P(
testing::Values(std::tuple<int, int, int>(0, 0, 0)),
testing::Values(format::yxfb,
format::bfyx,
format::byxf_af32,
format::b_fs_yx_fsv4,
format::b_fs_yx_fsv16,
format::b_fs_yx_fsv32)),
@ -2737,8 +2736,7 @@ INSTANTIATE_TEST_CASE_P(
format::b_fs_yx_fsv16,
format::fs_b_yx_fsv32,
format::b_fs_yx_fsv32,
format::b_fs_yx_fsv4,
format::fs_bs_yx_bsv4_fsv32)),
format::b_fs_yx_fsv4)),
testing::internal::DefaultParamName<pooling_random_test_params>);

TEST(pooling_forward_gpu, bsv16_fsv16_max_16x16x8x8_input_2x2_pool_2x2_stride)

@ -766,8 +766,6 @@ INSTANTIATE_TEST_CASE_P(smoke,
resample_random_test,
testing::ValuesIn(
resample_random_test_param_generator()
.smoke_params(data_types::i8, format::byxf_af32, format::byxf_af32)
.smoke_params(data_types::u8, format::byxf_af32, format::byxf_af32)
.smoke_params(data_types::i8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
.smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
.smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)

@ -105,16 +105,4 @@ TEST(tensor_api, linear_offsets) {
test_tensor_offset({ 2, 19, 4, 3 }, { 1, 18, 3, 2 }, cldnn::format::b_fs_yx_fsv16, 754);
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::fs_b_yx_fsv32, 675);
test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::fs_b_yx_fsv32, 1507);

// Formats with alignment:
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byxf_af32, 675);
test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byxf_af32, 1507);
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::byx8_f4, 331);
test_tensor_offset({ 2, 37, 4, 3 }, { 1, 35, 3, 2 }, cldnn::format::byx8_f4, 1755);

// Non-standard blocked formats:
// bf8_xy16 - b_fs_es_fsv8_esv16, where e is flattened yx := x + y * size_x
test_tensor_offset({ 2, 5, 4, 3 }, { 1, 3, 1, 2 }, cldnn::format::bf8_xy16, 185);
test_tensor_offset({ 2, 19, 7, 3 }, { 1, 18, 3, 2 }, cldnn::format::bf8_xy16, 1441);

}
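The expectations in the removed tests can be re-derived by hand (formulas inferred from the format descriptions and shown here only to illustrate the alignment rules; they reproduce both expectation pairs above). For byxf_af32 the linear offset is ((b * size_y + y) * size_x + x) * align(size_f, 32) + f, so for sizes { 2, 5, 4, 3 } (b, f, x, y) and coordinates { 1, 3, 1, 2 }: ((1 * 3 + 2) * 4 + 1) * 32 + 3 = 21 * 32 + 3 = 675. For byx8_f4 both x and f are padded, with align(4, 8) = 8 and align(5, 4) = 8: ((1 * 3 + 2) * 8 + 1) * 8 + 3 = 41 * 8 + 3 = 331. For bf8_xy16, with e = x + y * size_x = 1 + 2 * 4 = 9 and a single 8-feature / 16-element block each: (1 * 8 + 3) * 16 + 9 = 185, matching the removed expectations.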