[IE CLDNN] int8 batches optimization (#632)
This commit is contained in:
parent
b9d67927fd
commit
f8b2627c3b
@ -83,6 +83,7 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::byxf_af32);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
@ -0,0 +1,145 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"
#include <vector>
#include <iostream>

//
// Kernel specific constants
//
#define SIMD_SIZE 16

namespace kernel_selector {

ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::F32);

    k.EnableInputWeightsType(WeightsType::INT8);

    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

KernelsData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetCommonKernelsData(params, options);
}

JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
    auto mem_consts = Parent::GetJitConstants(params, kd);
    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"",
                                             {"out_b", "16 * j + out_f + get_sub_group_local_id()", "out_y", "out_x"},
                                             "dequantized",
                                             input_dt,
                                             1,
                                             LoadType::FEATURE_SHUFFLE};
        conf_scalar.SetLoopAxes({ Tensor::DataChannelName::BATCH }, true);
        conf_scalar.SetShuffleVarName("i");

        mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }

    return mem_consts;
}  // GetJitConstants

ConvolutionKernelBase::DispatchData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::SetDefault(const convolution_params& params, int) const {
    DispatchData kd;
    const auto& output = params.output;

    std::vector<size_t> global = {output.X().v, output.Y().v, output.Feature().v / 32 * output.Batch().v};
    std::vector<size_t> local = {1, 1, SIMD_SIZE};

    kd.gws0 = global[0];
    kd.gws1 = global[1];
    kd.gws2 = global[2];

    kd.lws0 = local[0];
    kd.lws1 = local[1];
    kd.lws2 = local[2];

    kd.cldnnStyle = {0, 0, 0, 0, 0};
    kd.gemmStyle = {0, 0, 0, 0, 0, 0};

    kd.efficiency = FORCE_PRIORITY_2;

    return kd;
}  // SetDefault

bool Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::Validate(const Params& params, const optional_params& options) const {
    if (!Parent::Validate(params, options)) {
        return false;
    }

    KernelData kd = KernelData::Default<convolution_params>(params);
    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());

    if ((newParams.filterSize.x != newParams.filterSize.y) || newParams.filterSize.x != 1) {
        // Filter size must be 1x1
        return false;
    }

    if (newParams.stride.x != newParams.stride.y) {
        // Strides must be equal
        return false;
    }
    if (newParams.output.X().v != newParams.output.Y().v) {
        // W and H must be equal
        return false;
    }

    if (newParams.output.Feature().v % 32 != 0) {
        // Output feature size must be divisible by 32
        return false;
    }

    if (newParams.output.Batch().v % 16 != 0) {
        // Batch size must be divisible by 16
        return false;
    }

    // Check that all fused ops except eltwise have only feature or scalar inputs
    for (auto& fo : newParams.fused_ops) {
        if (fo.GetType() == FusedOpType::ELTWISE)
            continue;
        for (auto& input : fo.tensors) {
            if (input.X().v != 1 || input.Y().v != 1 || input.Batch().v != 1)
                return false;
        }
    }

    return true;
}
}  // namespace kernel_selector
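For reference, a minimal host-side sketch (not part of this commit; F and B are made-up example sizes) of how SetDefault()'s third dispatch dimension (gws2 = Feature / 32 * Batch, lws2 = SIMD_SIZE = 16) partitions the output: each 16-lane work-group produces 32 output features for one 16-batch slice, which is exactly why Validate() rejects feature % 32 != 0 and batch % 16 != 0. The out_f / out_b formulas mirror the ones in the OpenCL kernel further down.

#include <cstdio>

int main() {
    const unsigned F = 64, B = 32;            // example output feature/batch sizes
    const unsigned groups = F / 32 * B / 16;  // gws2 / lws2 from SetDefault()
    for (unsigned g = 0; g < groups; ++g) {
        unsigned out_f = g * 32 % F;          // first of 32 features for this group
        unsigned out_b = (g * 32 / F) * 16;   // first of 16 batches (one per lane)
        std::printf("group %u -> features [%u..%u), batches [%u..%u)\n",
                    g, out_f, out_f + 32, out_b, out_b + 16);
    }
    return 0;
}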
@ -0,0 +1,49 @@
/*
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1 : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1() : ConvolutionKernelBase("convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1") {}
    virtual ~Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& params, const optional_params& options) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
    bool NeedPaddedInput() const override { return true; }
    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
        return { WeightsLayout::os_is_yx_osv16_isv16 };
    }

    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
}  // namespace kernel_selector
@ -0,0 +1,146 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"
#include "kernel_selector_utils.h"
#include "common_tools.h"
#include <vector>
#include <iostream>

//
// Kernel specific constants
//
#define SIMD_SIZE 16

namespace kernel_selector {

ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::F32);

    k.EnableInputWeightsType(WeightsType::INT8);

    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableDifferentTypes();
    k.EnableDifferentInputWeightsTypes();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBiasPerFeature();
    k.EnableNonBiasTerm();
    k.EnableBatching();
    k.EnableQuantization(QuantizationType::SYMMETRIC);
    k.DisableTuning();
    return k;
}

KernelsData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetCommonKernelsData(params, options);
}

JitConstants Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
    auto mem_consts = Parent::GetJitConstants(params, kd);

    if (!params.fused_ops.empty()) {
        auto input_dt = GetActivationType(params);
        FusedOpsConfiguration conf_scalar = {"",
                                             {"out_b", "out_f + get_sub_group_local_id()", "out_y", "out_x"},
                                             "dequantized",
                                             input_dt,
                                             1,
                                             LoadType::FEATURE_SHUFFLE};
        conf_scalar.SetLoopAxes({ Tensor::DataChannelName::BATCH }, true);
        conf_scalar.SetShuffleVarName("i");
        mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
    }

    return mem_consts;
}  // GetJitConstants

ConvolutionKernelBase::DispatchData Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::SetDefault(const convolution_params& params, int) const {
    DispatchData kd;
    const auto& output = params.output;

    std::vector<size_t> global = {output.X().v, output.Y().v, output.Feature().v / 16 * output.Batch().v};
    std::vector<size_t> local = {1, 1, SIMD_SIZE};

    kd.gws0 = global[0];
    kd.gws1 = global[1];
    kd.gws2 = global[2];

    kd.lws0 = local[0];
    kd.lws1 = local[1];
    kd.lws2 = local[2];

    kd.cldnnStyle = {0, 0, 0, 0, 0};
    kd.gemmStyle = {0, 0, 0, 0, 0, 0};

    kd.efficiency = FORCE_PRIORITY_2;

    return kd;
}  // SetDefault

bool Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::Validate(const Params& params, const optional_params& options) const {
    if (!Parent::Validate(params, options)) {
        return false;
    }

    KernelData kd = KernelData::Default<convolution_params>(params);
    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());

    if ((newParams.filterSize.x != newParams.filterSize.y) ||
        newParams.filterSize.x != 3) {
        // Filter size must be 3x3
        return false;
    }

    if (newParams.stride.x != newParams.stride.y) {
        // Strides must be equal
        return false;
    }
    if (newParams.output.X().v != newParams.output.Y().v) {
        // W and H must be equal
        return false;
    }

    if (newParams.output.Feature().v % 16 != 0) {
        // Output feature size must be divisible by 16
        return false;
    }

    if (newParams.output.Batch().v % 16 != 0) {
        // Batch size must be divisible by 16
        return false;
    }

    // Check that all fused ops except eltwise have only feature or scalar inputs
    for (auto& fo : newParams.fused_ops) {
        if (fo.GetType() == FusedOpType::ELTWISE)
            continue;
        for (auto& input : fo.tensors) {
            if (input.X().v != 1 || input.Y().v != 1 || input.Batch().v != 1)
                return false;
        }
    }

    return true;
}
}  // namespace kernel_selector
@ -0,0 +1,49 @@
/*
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#pragma once

#include "convolution_kernel_base.h"
#include <vector>

namespace kernel_selector {

class Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3 : public ConvolutionKernelBase {
public:
    using Parent = ConvolutionKernelBase;
    Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3() : ConvolutionKernelBase("convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_3x3") {}
    virtual ~Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;

protected:
    bool Validate(const Params& params, const optional_params& options) const override;
    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
    bool NeedPaddedInput() const override { return true; }
    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
        return { WeightsLayout::os_is_yx_osv16_isv16 };
    }

    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }
};
}  // namespace kernel_selector
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -73,6 +73,8 @@
#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
#include "convolution_kernel_b_fs_yx_fsv16_imad.h"
#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1.h"
#include "convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3.h"

namespace kernel_selector {
convolution_kernel_selector::convolution_kernel_selector() {
@ -93,6 +95,8 @@ convolution_kernel_selector::convolution_kernel_selector() {

    // bs_fs_yx_bsv16_fsv16
    Attach<ConvolutionKernel_bfyx_to_bfyx_bsv16_fsv16>();
    Attach<Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1>();
    Attach<Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3>();

    // fs_byx_fsv32
    Attach<ConvolutionKernel_fs_byx_fsv32>();
@ -0,0 +1,96 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"
#include "kernel_selector_utils.h"

//
// Kernel specific constants
//
#define SIMD_SIZE 16

namespace kernel_selector {
ParamsKey Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::INT8);
    k.EnableOutputDataType(Datatype::UINT8);

    k.EnableOutputDataType(Datatype::F32);

    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);

    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnablePoolType(PoolType::MAX);
    k.EnablePoolType(PoolType::AVG);
    k.EnablePoolRemainder(PoolRemainder::FLOOR);
    k.EnablePoolRemainder(PoolRemainder::CEIL);
    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
    k.EnableDifferentTypes();
    return k;
}

PoolingKernelBase::DispatchData Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::SetDefault(const pooling_params& params) const {
    DispatchData runInfo = PoolingKernelBase::SetDefault(params);

    runInfo.gws0 = params.output.Feature().v / 16;
    runInfo.gws1 = params.output.X().v * params.output.Y().v;
    runInfo.gws2 = params.output.Batch().v;

    runInfo.lws0 = 1;
    runInfo.lws1 = 1;
    runInfo.lws2 = SIMD_SIZE;
    runInfo.efficiency = FORCE_PRIORITY_1;

    return runInfo;
}

JitConstants Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetJitConstants(const pooling_params& params, DispatchData kd) const {
    auto jit = PoolingKernelBase::GetJitConstants(params, kd);

    if (!params.fused_ops.empty()) {
        auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
        FusedOpsConfiguration conf = {"", {"b", "f", "y", "x"}, "pool_result[i]", input_dt, 1};
        jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
    }

    return jit;
}

KernelsData Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::GetKernelsData(const Params& params, const optional_params& options) const {
    return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
}

bool Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16::Validate(const Params& params, const optional_params& options) const {
    if (!PoolingKernelBase::Validate(params, options)) {
        return false;
    }
    auto p = dynamic_cast<const pooling_params&>(params);

    if (p.inputs[0].Feature().v % 16 != 0)
        return false;

    if (p.inputs[0].Batch().v % 16 != 0)
        return false;

    return true;
}
}  // namespace kernel_selector
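A small sketch (illustrative sizes, not from the diff) of how this pooling dispatch decomposes: each work-item covers one 16-feature slice at one output (x, y), while the 16 subgroup lanes of dimension 2 cover one bsv16 batch slice, matching the b/f/y/x math in the OpenCL kernel later in this commit.

#include <cstdio>

int main() {
    const unsigned F = 32, X = 4, Y = 2;                   // assumed example tensor sizes
    const unsigned gws0 = F / 16, gws1 = X * Y;            // as in SetDefault(); gws2 = Batch
    for (unsigned g0 = 0; g0 < gws0; ++g0)
        for (unsigned g1 = 0; g1 < gws1; ++g1)
            std::printf("f-slice %u, y=%u, x=%u: 16 batches, one per subgroup lane\n",
                        g0, g1 / X, g1 % X);
    return 0;
}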
@ -0,0 +1,41 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


#pragma once

#include "pooling_kernel_base.h"
#include <vector>

namespace kernel_selector {
class Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16 : public PoolingKernelBase {
public:
    Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16() : PoolingKernelBase("pooling_gpu_bs_fs_yx_bsv16_fsv16") {}
    virtual ~Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16() {}

    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
    ParamsKey GetSupportedKey() const override;
    bool Validate(const Params&, const optional_params&) const override;
    DispatchData SetDefault(const pooling_params& params) const override;
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::ELTWISE,
                 FusedOpType::QUANTIZE,
                 FusedOpType::SCALE,
                 FusedOpType::ACTIVATION };
    }

protected:
    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
};
}  // namespace kernel_selector
@ -1,4 +1,4 @@
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -28,6 +28,7 @@
#include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
#include "pooling_kernel_gpu_bsv16_fsv16.h"
#include "pooling_kernel_gpu_b_fs_yx_fsv16_imad.h"
#include "pooling_kernel_gpu_bs_fs_yx_bsv16_fsv16.h"

namespace kernel_selector {

@ -46,6 +47,7 @@ pooling_kernel_selector::pooling_kernel_selector() {
    Attach<PoolingKernel_b_fs_yx_fsv16>();
    Attach<PoolingKernel_bsv16_fsv16>();
    Attach<PoolingKernelGPU_b_fs_yx_fsv16_imad>();
    Attach<Pooling_kernel_gpu_bs_fs_yx_bsv_16_fsv16>();
}

KernelsData pooling_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
@ -41,6 +41,7 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableInputLayout(DataLayout::byxf_af32);
    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableOutputLayout(DataLayout::bfyx);
    k.EnableOutputLayout(DataLayout::yxfb);
    k.EnableOutputLayout(DataLayout::bfzyx);
@ -50,6 +51,7 @@ ParamsKey QuantizeKernelScaleShift::GetSupportedKey() const {
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
@ -0,0 +1,144 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
#define ACTIVATION_TYPE float
#define TO_ACTIVATION_TYPE(x) convert_float(x)
#else
#define ACCUMULATOR_TYPE INPUT0_TYPE
#define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
#define ACTIVATION_TYPE INPUT0_TYPE
#define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
#endif

#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
#define OUTPUT_TYPE16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
    const __global INPUT0_TYPE *conv_input,
    __global OUTPUT_TYPE *output,
    const __global FILTER_TYPE *weights,
#if BIAS_TERM
    const __global BIAS_TYPE *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    const uint out_x = (uint)get_global_id(0);
    const uint out_y = (uint)get_global_id(1);
    const uint out_f = (uint)get_group_id(2) * 32 % OUTPUT_FEATURE_NUM;
    const uint out_b = ((uint)get_group_id(2) * 32 / OUTPUT_FEATURE_NUM) * 16 + get_sub_group_local_id();

    ACCUMULATOR_TYPE dotProd[32] = {0};
    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint weights_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);

    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + get_sub_group_local_id(), 0, 0, 0);
    uint filter_idx2 = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + 16 + get_sub_group_local_id(), 0, 0, 0);

    uint input_idx = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, out_b, 0, input_y, input_x);
    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < INPUT0_FEATURE_NUM / 16; k++) {
        uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
        uint4 weights_val = vload4(0, (__global uint *)(weights + filter_idx));
        uint4 weights_val2 = vload4(0, (__global uint *)(weights + filter_idx2));

        __attribute__((opencl_unroll_hint(16)))
        for (uint j = 0; j < 16; j++) {
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
            dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));

            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val2.s0, j))));
            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val2.s1, j))));
            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val2.s2, j))));
            dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val2.s3, j))));
        }
        filter_idx += weights_x_pitch;
        filter_idx2 += weights_x_pitch;
        input_idx += input_fs_pitch;
    }

    OUTPUT_TYPE16 results = 0;

    __attribute__((opencl_unroll_hint(2)))
    for (uint j = 0; j < 2; j++) {
        const uint dst_index = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, out_b, 16 * j + out_f, out_y, out_x);
#if BIAS_TERM
        ACTIVATION_TYPE bias = biases[out_f + 16 * j + get_sub_group_local_id()];
#endif
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
        FUSED_OPS_PRELOAD
#endif
        __attribute__((opencl_unroll_hint(16)))
        for (uint i = 0; i < 16; i++) {
            ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
            dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + intel_sub_group_shuffle(bias, i);
#else
            dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i];
#endif
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
            FUSED_OPS_CALC
#else
            FUSED_OPS
#endif
            OUTPUT_TYPE res = FUSED_OPS_RESULT;
            results[i] = res;
#else
            results[i] = TO_OUTPUT_TYPE(dequantized);
#endif
        }

#if OUTPUT_TYPE_SIZE == 1
        vstore4(as_uint4(results), 0, ((__global uint *)(output + dst_index)));
#else
        __attribute__((opencl_unroll_hint(16)))
        for (uint z = 0; z < 16; z++) {
            output[dst_index + z] = results[z];
        }
#endif
    }
}

#undef BLOCK_LOAD_INPUTS
#undef IN_BLOCK_WIDTH
#undef IN_BLOCK_HEIGHT
#undef PACK
#undef AS_TYPE_N_
#undef AS_TYPE_N
#undef AS_INPUT0_TYPE_4
#undef NUM_FILTERS
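A scalar C++ model (illustrative only, not the hardware intrinsic) of the IMAD step the loop above relies on: each 32-bit component of the uint4 loads packs four int8 values, and IMAD accumulates their 4-element dot product into an int32, so the four .s0..s3 lines together consume 16 input features per k iteration, with intel_sub_group_shuffle(weights_val.sN, j) broadcasting lane j's packed weights to the whole subgroup. The packing constant assumes a little-endian host.

#include <cstdint>
#include <cstring>

// acc + dot(a4, b4) over four packed int8 values, as one IMAD computes.
int32_t imad_model(int32_t acc, uint32_t packed_a, uint32_t packed_b) {
    int8_t a[4], b[4];
    std::memcpy(a, &packed_a, 4);  // the as_char4() reinterpretation
    std::memcpy(b, &packed_b, 4);
    for (int i = 0; i < 4; ++i)
        acc += int32_t(a[i]) * int32_t(b[i]);
    return acc;
}

int main() {
    // Activations {1,2,3,4} against weights {1,1,1,1} -> 10 (little-endian packing).
    return imad_model(0, 0x04030201u, 0x01010101u) == 10 ? 0 : 1;
}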
@ -0,0 +1,141 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"
#include "include/fetch.cl"
#include "include/imad.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
#define ACTIVATION_TYPE float
#define TO_ACTIVATION_TYPE(x) convert_float(x)
#else
#define ACCUMULATOR_TYPE INPUT0_TYPE
#define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
#define ACTIVATION_TYPE INPUT0_TYPE
#define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
#endif
#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
#define OUTPUT_TYPE16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)

// int8 conv_input and weights data is packed into int32 "batches";
// int/uint pointers are used here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_3x3)(
    const __global INPUT0_TYPE *conv_input,
    __global OUTPUT_TYPE *output,
    const __global FILTER_TYPE *weights,
#if BIAS_TERM
    const __global BIAS_TYPE *biases,
#endif
#if HAS_FUSED_OPS_DECLS
    FUSED_OPS_DECLS,
#endif
    uint split_idx)
{
    const uint out_x = (uint)get_global_id(0);
    const uint out_y = (uint)get_global_id(1);
    const uint out_f = (uint)get_group_id(2) * 16 % OUTPUT_FEATURE_NUM;
    const uint out_b = ((uint)get_group_id(2) * 16 / OUTPUT_FEATURE_NUM) * 16 + get_sub_group_local_id();

    ACCUMULATOR_TYPE dotProd[16] = {0};
    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint weights_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
    const uint y_diff = input_y_pitch - 3 * input_x_pitch;
    const uint f_diff = input_fs_pitch - 3 * input_y_pitch;
    uint input_idx = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, out_b, 0, input_y, input_x);

    __attribute__((opencl_unroll_hint(1)))
    for (uint k = 0; k < INPUT0_FEATURE_NUM / 16; k++) {
        uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + get_sub_group_local_id(), k*16, 0, 0);

        __attribute__((opencl_unroll_hint(1)))
        for (uint y = 0; y < FILTER_SIZE_Y; y++) {
            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
            for (uint x = 0; x < FILTER_SIZE_X; x++) {
                uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
                uint4 weights_val = vload4(0, (__global uint *)(weights + filter_idx));

                __attribute__((opencl_unroll_hint(16)))
                for (uint j = 0; j < 16; j++) {
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
                    dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
                }
                filter_idx += weights_x_pitch;
                input_idx += input_x_pitch;
            }
            input_idx += y_diff;
        }
        input_idx += f_diff;
    }

    OUTPUT_TYPE16 results = 0;
    const uint dst_index = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, out_b, out_f, out_y, out_x);
#if BIAS_TERM
    ACTIVATION_TYPE bias = biases[out_f + get_sub_group_local_id()];
#endif
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
    FUSED_OPS_PRELOAD
#endif
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; i++) {
        ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
        dequantized = (ACTIVATION_TYPE)dotProd[i] + intel_sub_group_shuffle(bias, i);
#else
        dequantized = (ACTIVATION_TYPE)dotProd[i];
#endif
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
        FUSED_OPS_CALC
#else
        FUSED_OPS
#endif
        OUTPUT_TYPE res = FUSED_OPS_RESULT;
        results[i] = res;
#else
        results[i] = TO_OUTPUT_TYPE(dequantized);
#endif
    }
#if OUTPUT_TYPE_SIZE == 1
    vstore4(as_uint4(results), 0, ((__global uint *)(output + dst_index)));
#else
    __attribute__((opencl_unroll_hint(16)))
    for (uint z = 0; z < 16; z++) {
        output[dst_index + z] = results[z];
    }
#endif
}

#undef BLOCK_LOAD_INPUTS
#undef IN_BLOCK_WIDTH
#undef IN_BLOCK_HEIGHT
#undef PACK
#undef AS_TYPE_N_
#undef AS_TYPE_N
#undef AS_INPUT0_TYPE_4
#undef NUM_FILTERS
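A sketch (with assumed pitch values, not the real JIT constants) of the incremental index walk the 3x3 kernel uses instead of recomputing a full 4-D index per tap: step right by x_pitch inside a row, add y_diff to drop to the start of the next row, and add f_diff to rewind to the window origin in the next 16-feature slice.

#include <cstdio>

int main() {
    const unsigned x_pitch = 16 * 16;        // one (bsv16, fsv16) cell
    const unsigned y_pitch = x_pitch * 10;   // assume a padded row of 10 cells
    const unsigned fs_pitch = y_pitch * 10;  // assume a padded plane of 10 rows
    const unsigned y_diff = y_pitch - 3 * x_pitch;
    const unsigned f_diff = fs_pitch - 3 * y_pitch;
    unsigned idx = 0;                        // window origin
    for (unsigned y = 0; y < 3; ++y) {
        for (unsigned x = 0; x < 3; ++x) {
            std::printf("tap(%u,%u) at %u\n", y, x, idx);
            idx += x_pitch;                  // next tap in the row
        }
        idx += y_diff;                       // start of the next row
    }
    idx += f_diff;                           // origin of the next feature slice
    std::printf("next slice origin at %u (== fs_pitch)\n", idx);
    return 0;
}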
@ -1,4 +1,4 @@
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -195,7 +195,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
        uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
        uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1
#elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1 || OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16 == 1
        uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
#else
#error "Incorrect output layout"
@ -0,0 +1,184 @@
// Copyright (c) 2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/include_all.cl"
#include "include/data_types.cl"

#define ALIGN_TO(val, multiple) (((val) + (multiple)-1) / (multiple) * (multiple))

#define AS_TYPE(type, val) CAT(as_, type)(val)
#define IN_VEC16 MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
#define OUT_VEC16 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
#define CONVERT_OUT CAT(convert_, OUTPUT_TYPE)
#define CONVERT_OUT_VEC16 CAT(convert_, OUT_VEC16)
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16
#if MAX_POOLING
#define INIT_VAL -128
#elif AVG_POOLING
#define INIT_VAL 0
#else
#error
#endif

inline int FUNC(apply_pooling)(int tmp, int in) {
#if MAX_POOLING
    return max(tmp, in);
#elif AVG_POOLING
    return tmp + in;
#endif
}

__attribute__((intel_reqd_sub_group_size(16)))
KERNEL(pooling_gpu_bs_fs_yx_bsv16_fsv16)(const __global INPUT0_TYPE* input,
                                         __global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
                                         , FUSED_OPS_DECLS
#endif
) {
    const uint f = (uint)get_global_id(0) * 16;
    const uint y = (uint)get_global_id(1) / OUTPUT_SIZE_X;
    const uint x = (uint)get_global_id(1) % OUTPUT_SIZE_X;
    const uint b = (uint)get_group_id(2) * 16 + get_sub_group_local_id();

    const int offset_x = (int)x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int offset_y = (int)y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    const uint input_x_pitch = BATCH_SLICE_SIZE * FEATURE_SLICE_SIZE;
    const uint input_y_pitch = input_x_pitch * (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X);
    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
    int16 result = INIT_VAL;

#ifdef CHECK_BOUNDRY
    uint batch_and_feature_offset = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, 0, 0);
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || offset_y + POOL_SIZE_Y < 0 ||
        offset_y >= INPUT0_SIZE_Y) {
        return;
    }

#ifdef DYNAMIC_KERNEL_DIVIDER
    uint num_elements = 0;
#endif
    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
    for (uint j = 0; j < POOL_SIZE_Y; j++) {
        int input_offset_y = offset_y + j;
        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
        if (!zero_y) {
            __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
            for (uint i = 0; i < POOL_SIZE_X; i++) {
                int input_offset_x = offset_x + i;
                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
                if (!zero) {
                    const uint input_idx =
                        batch_and_feature_offset + input_offset_y * input_y_pitch + input_offset_x * input_x_pitch;
                    int4 int_data = vload4(0, (__global int*)(input + input_idx));
                    IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
                    __attribute__((opencl_unroll_hint(16)))
                    for (uint z = 0; z < 16; z++)
                        result[z] = FUNC_CALL(apply_pooling)(result[z], (int)ch16_data[z]);
#ifdef DYNAMIC_KERNEL_DIVIDER
                    num_elements++;
#endif
                }
            }
        }
    }

#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
    const uint num_elements = (hend - offset_y) * (wend - offset_x);
#endif
#else  // !CHECK_BOUNDRY
    uint input_idx = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, offset_y, offset_x);
    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
    for (uint j = 0; j < POOL_SIZE_Y; j++) {
        __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
        for (uint i = 0; i < POOL_SIZE_X; i++) {
            int4 int_data = vload4(0, (__global int*)(input + input_idx));
            IN_VEC16 ch16_data = AS_TYPE(IN_VEC16, int_data);
            __attribute__((opencl_unroll_hint(16)))
            for (uint z = 0; z < 16; z++)
                result[z] = FUNC_CALL(apply_pooling)(result[z], (int)ch16_data[z]);

            input_idx += input_x_pitch;
        }
        input_idx += (input_y_pitch - POOL_SIZE_X * input_x_pitch);
    }

#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
    const uint num_elements = POOL_SIZE_X * POOL_SIZE_Y;
#endif
#endif
#if defined AVG_POOLING
#if ENABLE_ROUND
    int16 pool_result;
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
        pool_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
#else
        pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
#endif
    }
#else
    float16 pool_result;
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; i++) {
#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
        pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
#else
        pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
#endif
    }
#endif  // ENABLE_ROUND
#else  // AVG_POOLING
    int16 pool_result;
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; ++i) {
        pool_result[i] = result[i];
    }
#endif  // AVG_POOLING
    OUT_VEC16 final_result = (OUTPUT_TYPE)(0);
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
    FUSED_OPS_PRELOAD
#endif
    __attribute__((opencl_unroll_hint(16)))
    for (uint i = 0; i < 16; ++i) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD
        FUSED_OPS_CALC
#else
        FUSED_OPS
#endif
        final_result[i] = FUSED_OPS_RESULT;
#else
        final_result[i] = pool_result[i];
#endif
    }
    const uint output_pos = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, b, f, y, x);

#if OUTPUT_TYPE_SIZE == 1
    vstore4(as_uint4(final_result), 0, ((__global uint*)(output + output_pos)));
#else
    *((__global OUT_VEC16*)(output + output_pos)) = final_result;
#endif
}

#undef ALIGN_TO
#undef AS_TYPE
#undef IN_VEC16
#undef OUT_VEC16
#undef CONVERT_OUT
#undef CONVERT_OUT_VEC16
#undef INIT_VAL
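A hedged C++ sketch of the divider choice the AVG branch above makes per lane (the mode names mirror the JIT defines; the sum and element count are made-up example values): FIXED always divides by the full window area, while the DYNAMIC variants divide by the number of taps that actually landed inside the input.

#include <algorithm>
#include <cstdio>

int main() {
    const int pool_x = 3, pool_y = 3;
    int sum = 42;            // accumulated window sum for one lane/feature
    unsigned in_window = 6;  // taps counted inside the input bounds

    float fixed_avg   = float(sum) / float(pool_x * pool_y);          // KernelDividerMode::FIXED
    float dynamic_avg = float(sum) / float(std::max(in_window, 1u));  // DYNAMIC / DYNAMIC_WITH_PADDING
    std::printf("fixed=%f dynamic=%f\n", fixed_avg, dynamic_avg);
    return 0;
}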
@ -1093,6 +1093,7 @@ JitConstants FusedOpsCodeGenerator::MakeLoadJitConstants(const FusedOpsConfigura

    auto vec_size = conf.vec_size;
    auto idx = conf.bfzyx_idx_order;
    auto fused_op_config = conf;

    std::string load_decls = "";
    static int i = 0;
@ -1103,11 +1104,19 @@ JitConstants FusedOpsCodeGenerator::MakeLoadJitConstants(const FusedOpsConfigura
    if (reuse_index) {
        load_decls += "\\\n\tint " + reused_idx + " = " + GetIdx(0, idx_desc{idx, desc.tensors[0]}, safe_load) + ";";
    }
    // TODO: add a generic way to support a shuffled feature, e.g. the possibility to add a separate config for each fused op
    if (desc.GetType() == KernelType::ELTWISE && conf.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE) {
        std::string sub_group_local_id_str = "get_sub_group_local_id()";
        size_t found_sub = conf.bfzyx_idx_order[1].rfind(sub_group_local_id_str);
        if (found_sub != std::string::npos)
            fused_op_config.bfzyx_idx_order[1].replace(found_sub, sub_group_local_id_str.length(), fused_op_config.shuffle_var_name);
    }

    for (auto op_input_id : GetRequiredInputs()) {
        load_decls += "\\\n\t" + GetInputTypeName(op_input_id, vec_size) + " " + GetInputVarName(op_input_id) + " = " +
                      GetJitLoad(conf, op_input_id, prim_output, reuse_index, reused_idx) + ";";
                      GetJitLoad(fused_op_config, op_input_id, prim_output, reuse_index, reused_idx) + ";";
    }

    jit.AddConstant(MakeJitConstant("FUSED_OP" + std::to_string(desc.op_id) + "_LOAD" + conf.suffix, load_decls));

    return jit;
@ -1121,13 +1130,20 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati
    std::string op_decls = "";
    auto vec_size = conf.vec_size;
    auto idx = conf.bfzyx_idx_order;
    std::string shuffle_var = conf.shuffle_var_name;
    bool is_shuffled = false;

    out_var = GetOutputVarName(in_var);
    out_type = desc.output_tensor.GetDType();

    if (conf.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE &&
        (desc.GetType() == KernelType::SCALE || desc.GetType() == KernelType::QUANTIZE)) {
        is_shuffled = true;
    }

    std::vector<std::string> in_vars_converted;
    for (size_t i = 0; i < desc.tensors.size(); i++) {
        auto in_name = GetInputVarName(i);
        auto in_name = GetInputVarName(i, is_shuffled, shuffle_var);
        if (desc.tensors[0].GetDType() != desc.output_tensor.GetDType()) {
            in_name = ConvertToOutputType(in_name, vec_size);
        }
@ -1163,17 +1179,17 @@
    }

    auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size)
                                                 : ConvertToType(GetInputVarName(p->out_scale_idx), tmp_type, vec_size);
                                                 : ConvertToType(GetInputVarName(p->out_scale_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), tmp_type, vec_size)
                                                 : ConvertToType(GetInputVarName(p->out_shift_idx), tmp_type, vec_size);
                                                 : ConvertToType(GetInputVarName(p->out_shift_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), tmp_type, vec_size)
                                               : ConvertToType(GetInputVarName(p->in_scale_idx), tmp_type, vec_size);
                                               : ConvertToType(GetInputVarName(p->in_scale_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), tmp_type, vec_size)
                                               : ConvertToType(GetInputVarName(p->in_shift_idx), tmp_type, vec_size);
                                               : ConvertToType(GetInputVarName(p->in_shift_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), tmp_type, vec_size)
                                           : ConvertToType(GetInputVarName(p->in_range_lo_idx), tmp_type, vec_size);
                                           : ConvertToType(GetInputVarName(p->in_range_lo_idx, is_shuffled, shuffle_var), tmp_type, vec_size);
    auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), tmp_type, vec_size)
                                           : ConvertToType(GetInputVarName(p->in_range_hi_idx), tmp_type, vec_size);
                                           : ConvertToType(GetInputVarName(p->in_range_hi_idx, is_shuffled, shuffle_var), tmp_type, vec_size);

    if (p->has_clamp) {
        op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");";
@ -1353,7 +1369,10 @@ std::string FusedOpsCodeGenerator::GetInputPtrName(size_t input_id) const {
    return GetTypeStr() + std::to_string(desc.op_id) + "_input" + std::to_string(input_id);
}

std::string FusedOpsCodeGenerator::GetInputVarName(size_t input_id) const {
std::string FusedOpsCodeGenerator::GetInputVarName(size_t input_id, bool is_shuffled, std::string shuffle_var) const {
    if (is_shuffled)
        return "intel_sub_group_shuffle(" + GetTypeStr() + std::to_string(desc.op_id) + "_data" +
               std::to_string(input_id) + ", " + shuffle_var + ")";
    return GetTypeStr() + std::to_string(desc.op_id) + "_data" + std::to_string(input_id);
}

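A standalone sketch of the string GetInputVarName() now builds when is_shuffled is set; the "scale" prefix and the op/input ids are assumed example values (the real prefix comes from GetTypeStr()), and "i" matches the SetShuffleVarName("i") calls in the new convolution kernels.

#include <iostream>
#include <string>

std::string input_var_name(const std::string& type_str, int op_id,
                           int input_id, bool is_shuffled,
                           const std::string& shuffle_var) {
    std::string base = type_str + std::to_string(op_id) + "_data" + std::to_string(input_id);
    if (is_shuffled)  // read another lane's loaded value instead of this lane's
        return "intel_sub_group_shuffle(" + base + ", " + shuffle_var + ")";
    return base;
}

int main() {
    std::cout << input_var_name("scale", 0, 0, true, "i") << "\n";
    // -> intel_sub_group_shuffle(scale0_data0, i)
    return 0;
}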
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2018 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -353,7 +353,7 @@ public:
                   bool reuse_index = false, std::string reused_idx = "") const;
    std::string GetIdx(size_t input_id, idx_desc idx, bool should_be_safe) const;
    std::string GetInputPtrName(size_t input_id) const;
    std::string GetInputVarName(size_t input_id) const;
    std::string GetInputVarName(size_t input_id, bool is_shuffled = false, std::string shuffle_var = "") const;
    std::string GetOutputVarName(std::string input_var_name) const;
    std::string ConvertToOutputType(std::string var, size_t vec_size = 1) const;
    std::string ConvertToType(std::string var, Datatype dt, size_t vec_size = 1) const;
@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -99,12 +99,15 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa

        bool can_use_preload = fused_dep_codegen.CanPreloadData(c);
        can_all_use_preload &= can_use_preload;

        bool can_preload_eltwise = true;
        if (params.fused_ops[i].GetType() == FusedOpType::ELTWISE &&
            c.load_type == FusedOpsConfiguration::LoadType::FEATURE_SHUFFLE)
            can_preload_eltwise = false;
        fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
        fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
        if (can_use_preload)
        if (can_use_preload && can_preload_eltwise)
            fused_ops_preload += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
        if (c.allow_for_partial_preload && !can_use_preload)
        if (c.allow_for_partial_preload && (!can_use_preload || !can_preload_eltwise))
            fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
        fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
    }
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -440,7 +440,8 @@ struct base_activation_params {
struct FusedOpsConfiguration {
    enum class LoadType {
        LT_UNALIGNED = 0,
        LT_ALIGNED_READ = 1
        LT_ALIGNED_READ = 1,
        FEATURE_SHUFFLE = 2
    };

    enum class BoundaryCheck {
@ -476,6 +477,8 @@ struct FusedOpsConfiguration {
    // If allow_for_partial_preload is false, then it's required that all fused_ops can be preloaded.
    // If allow_for_partial_preload is true, then not preloaded fused_ops will be loaded in FUSED_OPS_CALC.
    bool allow_for_partial_preload;
    // Load index for shuffle fused op
    std::string shuffle_var_name;

    FusedOpsConfiguration(std::string suffix,
                          std::vector<std::string> bfzyx_idx_order,
@ -487,7 +490,8 @@ struct FusedOpsConfiguration {
                          IndexType index_type = IndexType::TENSOR_COORD,
                          Tensor::DataChannelName vec_axis = Tensor::DataChannelName::COUNT,
                          std::vector<Tensor::DataChannelName> loop_axes = {},
                          bool allow_for_partial_preload = false)
                          bool allow_for_partial_preload = false,
                          std::string shuffle_var_name = "")
        : suffix(suffix)
        , bfzyx_idx_order(bfzyx_idx_order)
        , input_var_name(input_var_name)
@ -498,7 +502,8 @@ struct FusedOpsConfiguration {
        , boundary_check(boundary_check)
        , index_type(index_type)
        , loop_axes(loop_axes)
        , allow_for_partial_preload(allow_for_partial_preload) { }
        , allow_for_partial_preload(allow_for_partial_preload)
        , shuffle_var_name(shuffle_var_name) { }

    FusedOpsConfiguration& SetVectorSize(size_t val) { vec_size = val; return *this; }
    FusedOpsConfiguration& SetLoadType(LoadType val) { load_type = val; return *this; }
@ -509,6 +514,7 @@ struct FusedOpsConfiguration {
        loop_axes = std::move(val);
        allow_for_partial_preload = partial_preload;
        return *this; }
    FusedOpsConfiguration& SetShuffleVarName(std::string val) { shuffle_var_name = val; return *this; }
};

// Instance of fused_operation_desc is added to fused_ops vector if a node has been fused to current one using program_impl::fuse_nodes
@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -225,6 +225,8 @@ attach_convolution_gpu::attach_convolution_gpu() {
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw);
    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw);
}

}  // namespace detail
@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -110,6 +110,9 @@ attach_fully_connected_gpu::attach_fully_connected_gpu() {
    {std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw},
    {std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw},
    // fs_b_yx_fsv32
    {std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw},
});

@ -1,5 +1,5 @@
/*
// Copyright (c) 2019 Intel Corporation
// Copyright (c) 2019-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -199,6 +199,8 @@ attach_pooling_gpu::attach_pooling_gpu() {
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
// MMAD
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), pooling_gpu::create);
implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), pooling_gpu::create);

@ -171,6 +171,10 @@ attach_quantize_gpu::attach_quantize_gpu() {

implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);

implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw);
implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw);
}

} // namespace detail

@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -123,6 +123,7 @@ void prepare_padding::run(program_impl& p) {
if (conv_layout.format != cldnn::format::bfyx &&
conv_layout.format != cldnn::format::bf8_xy16 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv16 &&
conv_layout.format != cldnn::format::bs_fs_yx_bsv16_fsv16 &&
conv_layout.format != cldnn::format::byxf_af32 &&
conv_layout.format != cldnn::format::fs_bs_yx_bsv4_fsv32 &&
conv_layout.format != cldnn::format::b_fs_yx_fsv4 &&

@ -321,8 +321,10 @@ void remove_redundant_reorders::run(program_impl& p) {

auto& usr = node->get_users().front();
auto& dep = node->get_dependency(0);
if (!usr->is_type<quantize>() || node->get_output_layout().format != format::bfyx ||
(dep.get_output_layout().format != format::b_fs_yx_fsv16 && dep.get_output_layout().format != format::fs_b_yx_fsv32))
if (!usr->is_type<quantize>() ||
(dep.get_output_layout().format != format::b_fs_yx_fsv16 &&
dep.get_output_layout().format != format::fs_b_yx_fsv32 &&
dep.get_output_layout().format != format::bfyx))
continue;

dep.merge_output_padding(node->get_output_layout().data_padding);

@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -157,6 +157,7 @@ private:
const layout &weights_layout,
std::shared_ptr<const convolution> conv);
bool convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_layout,
const layout& weights_layout,
std::shared_ptr<const convolution> conv);
bool convolution_fs_b_yx_fsv32_opt(const layout& input_layout,
const layout& weights_layout,

@ -1,4 +1,4 @@
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -206,6 +206,8 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
return cldnn::format::bfwzyx;
case kernel_selector::data_layout::b_fs_yx_fsv4:
return cldnn::format::b_fs_yx_fsv4;
case kernel_selector::data_layout::bs_fs_yx_bsv16_fsv16:
return cldnn::format::bs_fs_yx_bsv16_fsv16;
case kernel_selector::data_layout::nv12:
return cldnn::format::nv12;
case kernel_selector::data_layout::image_2d_rgba:
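
Editor's note: for readers unfamiliar with the layout name, a simplified sketch of the flat offset bs_fs_yx_bsv16_fsv16 implies, assuming no padding and batch/feature counts already aligned to 16. The exact in-memory order is defined by cldnn's format traits; this helper is illustrative only, not part of the source tree:

// b/f/y/x are logical coordinates; F/Y/X are the aligned feature and
// spatial sizes. Blocks of 16 batches are outermost; at each (y, x)
// location a 16-batch x 16-feature tile is contiguous, feature innermost.
size_t offset_bs_fs_yx_bsv16_fsv16(size_t b, size_t f, size_t y, size_t x,
                                   size_t F, size_t Y, size_t X) {
    const size_t bsv = 16, fsv = 16;
    const size_t f_blocks = F / fsv;
    return ((((b / bsv) * f_blocks + f / fsv) * Y + y) * X + x) * (bsv * fsv)
           + (b % bsv) * fsv + (f % fsv);
}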

@ -211,7 +211,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
(fmt_next == format::b_fs_yx_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) ||
(fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
(prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3)))
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3) ||
(fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3 &&
(next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8))))
return true;

if (next.is_type<convolution>() &&
@ -221,12 +223,14 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
(prev_output_layout.size.feature[0] == 3 || (prev_output_layout.size.feature[0] == 4 && (prev_dt == data_types::u8 || prev_dt == data_types::i8))))))
return true;

if (next.is_type<quantize>() && fmt_prev == format::bfyx && fmt_next == format::b_fs_yx_fsv16)
if (next.is_type<quantize>() && fmt_prev == format::bfyx && (fmt_next == format::b_fs_yx_fsv16 ||
fmt_next == format::bs_fs_yx_bsv16_fsv16 || fmt_next == format::b_fs_yx_fsv4))
return true;

if (next.is_type<convolution>() &&
fmt_prev == format::bfyx && prev_output_layout.size.feature[0] == 3 &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32))
(fmt_prev == format::b_fs_yx_fsv4 || fmt_prev == format::bfyx) && prev_output_layout.size.feature[0] == 3 &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::byxf_af32 ||
fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;

if (next.is_type<convolution>() &&
@ -246,7 +250,8 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
return true;

if (prev.is_type<quantize>() &&
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 || fmt_next == format::b_fs_yx_fsv16))
(fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 ||
fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::bs_fs_yx_bsv16_fsv16))
return true;

return false;
@ -450,14 +455,24 @@ bool layout_optimizer::convolution_b_fs_zyx_fsv16_opt(layout const &input_layout
}

bool layout_optimizer::convolution_bs_fs_yx_bsv16_fsv16_opt(const layout &input_layout,
const layout& weights_layout,
std::shared_ptr<const convolution> conv) {
// A set of rules that define when bs_fs_yx_bsv16_fsv16 mem format can be used
bool correct_batch = input_layout.size.batch[0] > 16;
bool correct_batch = input_layout.size.batch[0] >= 16;
bool correct_feature = (input_layout.size.feature[0] % 16 == 0 || input_layout.size.feature[0] == 3) && conv->output_size.feature[0] % 16 == 0;
bool fp16_ver = input_layout.data_type == data_types::f16 && input_layout.size.batch[0] % 32 == 0;
bool fp32_ver = input_layout.data_type == data_types::f32 && input_layout.size.batch[0] % 16 == 0;
bool single_group = conv->groups == 1;
return (fp16_ver || fp32_ver) && correct_feature && correct_batch && single_group;
bool int8_sup = (input_layout.data_type == data_types::i8 || input_layout.data_type == data_types::u8) &&
input_layout.size.batch[0] % 16 == 0 && weights_layout.data_type == data_types::i8 &&
(conv->activations_zero_points.empty() && conv->weights_zero_points.empty());
auto ks_x = weights_layout.size.spatial[0];
auto ks_y = weights_layout.size.spatial[1];
int8_sup &= (input_layout.size.spatial[2] == 1 && ((ks_x == 1 && ks_y == 1) || (ks_x == 3 && ks_y == 3) || (ks_x == 7 && ks_y == 7)) &&
input_layout.size.batch[0] % 16 == 0 && weights_layout.size.batch[0] % 32 == 0 && conv->groups == 1 &&
conv->split() == 1 && conv->dilation == tensor{1});

return (int8_sup || fp16_ver || fp32_ver) && correct_feature && correct_batch && single_group;
}
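
Editor's note: condensed, the new int8 gate accepts a 2D convolution only when the batch is a block-aligned multiple of 16, weights are s8 with no zero points, the kernel is square 1x1/3x3/7x7, and output features are 32-aligned (reading weights batch as the output feature count). A standalone paraphrase over plain dimensions, dropping the 3D, group, split, and dilation checks for brevity; this hypothetical helper is not part of the tree:

bool int8_bsv16_fsv16_eligible(int batch, int in_f, int out_f,
                               int ks_x, int ks_y, bool has_zero_points) {
    const bool kernel_ok = (ks_x == 1 && ks_y == 1) ||
                           (ks_x == 3 && ks_y == 3) ||
                           (ks_x == 7 && ks_y == 7);
    // Mirrors correct_batch/correct_feature plus the int8_sup clauses above.
    return batch >= 16 && batch % 16 == 0 && !has_zero_points && kernel_ok &&
           out_f % 32 == 0 && (in_f % 16 == 0 || in_f == 3);
}

For example, int8_bsv16_fsv16_eligible(16, 32, 64, 3, 3, false) holds, while batch 8 or a 5x5 kernel would fall back to another int8 path.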

bool layout_optimizer::convolution_fs_b_yx_fsv32_opt(layout const& input_layout,
@ -630,7 +645,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
const float cond_denom = _total_conv > 0 ? 1.0f / static_cast<float>(_total_conv) : 1.0f;

if ((input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8)) {
if ((_optimization_attributes.b_fs_yx_fsv16_network &&
if ((_optimization_attributes.bs_fs_yx_bsv16_fsv16_network && expected_tensor.batch[0] % 16 == 0 &&
convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
} else if ((_optimization_attributes.b_fs_yx_fsv16_network &&
convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
expected_format = cldnn::format::b_fs_yx_fsv16;
} else {
@ -651,7 +669,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
expected_tensor = current_layout.size;
expected_format = cldnn::format::bfzyx;
} else if (_optimization_attributes.bs_fs_yx_bsv16_fsv16_network &&
convolution_bs_fs_yx_bsv16_fsv16_opt(node.input().get_output_layout(), prim)) {
convolution_bs_fs_yx_bsv16_fsv16_opt(node.input().get_output_layout(), output_or_weights_layout, prim)) {
expected_tensor = current_layout.size;
expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16;
} else if (_optimization_attributes.fs_b_yx_fsv32_network && !node.get_transposed() &&
@ -781,6 +799,11 @@ format layout_optimizer::get_preferred_format(program_node& node) {
output_layout,
node.as<detection_output>(),
layout{ data_types::f32, format::bfyx, tensor{} }).format;
} else if (node.is_type<quantize>()) {
auto layout = node.get_output_layout();
if ((layout.data_type == data_types::i8 || layout.data_type == data_types::u8) &&
layout.size.batch[0] % 16 == 0)
expected = format::b_fs_yx_fsv4;
} else if (node.is_type<reorder>() || node.is_type<input_layout>()) {
expected = node.get_output_layout().format;
} else if (node.is_type<reshape>()) {
@ -850,7 +873,7 @@ bool layout_optimizer::is_format_optimized(const convolution_node& node, const f
case format::fs_b_yx_fsv32:
return convolution_fs_b_yx_fsv32_opt(input_layout, weights_layout, prim);
case format::bs_fs_yx_bsv16_fsv16:
return convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, prim);
return convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, weights_layout, prim);
default:
throw std::invalid_argument(
"[Layout optimizer] Other formats in is_format_optimized(...) method are not implemented!");

@ -1186,7 +1186,8 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::scale::type_id() &&
prim.type() != cldnn::softmax::type_id() &&
prim.type() != cldnn::fully_connected::type_id() &&
prim.type() != cldnn::generic_layer::type_id())
prim.type() != cldnn::generic_layer::type_id() &&
prim.type() != cldnn::quantize::type_id())
can_use_bs_fs_yx_bsv16_fsv16 = false;
}
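
Editor's note: with quantize added to this allow-list, a network whose non-convolution primitives include quantize nodes no longer has can_use_bs_fs_yx_bsv16_fsv16 forced to false, which is what permits the batched int8 layout to be selected network-wide by the layout optimizer changes above.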

@ -8376,6 +8376,25 @@ struct params_generator : std::vector<convolution_random_test_all_params> {
return *this;
}

params_generator& bs_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
std::vector<int> strides = { 1, 2 };
for (auto s : strides) {
// 1x1
push_back(convolution_random_test_all_params{
// feature input filter stride offset dilation bias groups
//batch in out x y x y x y x y x y
16, 32, 32, { 4, 4 }, { 1, 1 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
16, 32, 32, { 9, 9 }, { 1, 1 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 3x3
push_back(convolution_random_test_all_params{
16, 32, 32, { 4, 4 }, { 3, 3 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
16, 32, 32, { 9, 9 }, { 3, 3 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
}
return *this;
}

params_generator& all_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
return smoke_test_params(input_format, asymm_weights, asymm_data, padded_input)
.extra_test_params(input_format, asymm_weights, asymm_data, padded_input);
@ -8421,6 +8440,7 @@ INSTANTIATE_TEST_CASE_P(
.smoke_test_params(format::b_fs_yx_fsv32, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16)
.bs_test_params(format::bs_fs_yx_bsv16_fsv16)
),
to_string_convolution_all_params
);
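
Editor's note: the new generator method can also be used on its own; a hypothetical standalone call, relying only on bs_test_params returning *this as defined above:

// Accumulate batch-16 cases in the blocked layout, then the same shapes
// with asymmetric quantization enabled.
params_generator gen;
gen.bs_test_params(format::bs_fs_yx_bsv16_fsv16)              // symmetric weights and data
   .bs_test_params(format::bs_fs_yx_bsv16_fsv16, true)        // asymmetric weights
   .bs_test_params(format::bs_fs_yx_bsv16_fsv16, true, true); // asymmetric weights and data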

@ -404,6 +404,8 @@ public:
#define CASE_CONV_U8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_8 {1, 3, 4, 5}, {1, 32, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_9 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx
#define CASE_CONV_U8S8_10 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx

#define CASE_CONV_S8S8_1 {1, 15, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_2 {1, 15, 5, 5}, {1, 30, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
@ -413,6 +415,8 @@ public:
#define CASE_CONV_S8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_8 {1, 3, 4, 5}, {1, 32, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_9 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx
#define CASE_CONV_S8S8_10 {16, 32, 5, 5}, {16, 32, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx

#define CASE_CONV3D_U8S8_1 {1, 15, 5, 4, 5}, {1, 30, 3, 2, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
#define CASE_CONV3D_U8S8_2 {1, 15, 5, 5, 5}, {1, 30, 3, 3, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
@ -1536,10 +1540,14 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_quantize_i8,
bc_test_params{CASE_CONV_U8S8_2, 2, 4},
bc_test_params{CASE_CONV_U8S8_3, 2, 4},
bc_test_params{CASE_CONV_U8S8_4, 2, 4},
bc_test_params{CASE_CONV_U8S8_9, 2, 4},
bc_test_params{CASE_CONV_U8S8_10, 2, 4},
bc_test_params{CASE_CONV_S8S8_1, 2, 4},
bc_test_params{CASE_CONV_S8S8_2, 2, 4},
bc_test_params{CASE_CONV_S8S8_3, 2, 4},
bc_test_params{CASE_CONV_S8S8_4, 2, 4},
bc_test_params{CASE_CONV_S8S8_9, 2, 4},
bc_test_params{CASE_CONV_S8S8_10, 2, 4},

bc_test_params{CASE_CONV3D_U8S8_1, 2, 4},
bc_test_params{CASE_CONV3D_U8S8_2, 2, 4},

@ -2599,6 +2599,20 @@ INSTANTIATE_TEST_CASE_P(
format::b_fs_yx_fsv32)),
testing::internal::DefaultParamName<pooling_random_test_params>);

INSTANTIATE_TEST_CASE_P(
batched_low_precision,
pooling_random_test,
testing::Combine(
testing::Values(16),
testing::Values(16, 32),
testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
testing::Values(std::tuple<int, int>(1, 1)),
testing::Values(std::tuple<int, int>(0, 0)),
testing::Values(format::bs_fs_yx_bsv16_fsv16)
),
testing::internal::DefaultParamName<pooling_random_test_params>);
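
Editor's note: reading the Combine axes in order as batch, feature count, input size, pooling window, stride, and offset (an inference from the values, not confirmed by this hunk), the new batched_low_precision instantiation sweeps batch 16 with 16 or 32 features, two input sizes and two window shapes, all in the bs_fs_yx_bsv16_fsv16 layout.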

template <typename InputT, pooling_mode Mode>
class pooling_scale_random_test_base : public pooling_random_test_base<InputT, Mode> {
public: