[IE CLDNN] Fully Connected layer 3d support (#2709)
This commit is contained in: parent f2f5e99f9f, commit b91047e9fe
@@ -43,6 +43,7 @@
#include <legacy/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_prior_to_ie_prior.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_nms_5_to_legacy.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/reshape_fully_connected.hpp>
#include <legacy/convert_function_to_cnn_network.hpp>
#include <legacy/ie_util_internal.hpp>
#include <legacy/graph_transformer.h>
@@ -241,11 +242,17 @@ InferenceEngine::ICNNNetwork::Ptr clDNNEngine::CloneAndTransformNetwork(const In
transformer.transform(nGraphFunc);
}

const auto reshape_fc_callback = [](const std::shared_ptr<const ::ngraph::Node>& node) -> bool {
    return node->input_value(0).get_shape().size() <= 3lu;
};

{
ngraph::pass::Manager manager = ngraph::pass::Manager();
manager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
manager.set_callback(transformations_callback);
auto pass_config = manager.get_pass_config();
pass_config->set_callback<ngraph::pass::ReshapeFullyConnected>(reshape_fc_callback);
manager.run_passes(nGraphFunc);
}
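For context, the reshape_fc_callback registered above is what turns the 3D path on at the graph level: for FullyConnected nodes whose input rank is 3 or less it returns true, which (per the usual pass-callback convention) disables the legacy ReshapeFullyConnected pass for that node, so the unflattened shape reaches the clDNN primitive. A minimal sketch of the effect (the example shapes are hypothetical, not taken from this patch):

// reshape_fc_callback(node) == true  -> ReshapeFullyConnected is skipped for that node
//   FC input {8, 128, 512}     -> rank 3 -> callback true  -> FC keeps its 3D input
//   FC input {8, 3, 224, 224}  -> rank 4 -> callback false -> input is flattened to 2D as before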
@@ -1084,6 +1084,8 @@ void Program::CreateWeightAndBiasPrimitives(cldnn::topology& topology,
case FullyConnected: {
groupSize = 1;
outFeatures = static_cast<cldnn::tensor::value_type>(layer->outData[0]->getTensorDesc().getDims()[1]);
if (in0dims.size() == 3)
outFeatures = static_cast<cldnn::tensor::value_type>(layer->outData[0]->getTensorDesc().getDims()[2]);
switch (in0dims.size()) {
case 4:
weightDimsVec = { TensorValue(layer->outData[0]->getTensorDesc().getDims().back()),
@@ -1093,8 +1095,8 @@ void Program::CreateWeightAndBiasPrimitives(cldnn::topology& topology,
break;
case 3:
weightDimsVec = { TensorValue(layer->outData[0]->getTensorDesc().getDims().back()),
TensorValue(in0dims[1]),
TensorValue(in0dims[2]),
1,
1 };
break;
case 2:
@@ -2927,11 +2929,14 @@ void Program::CreateFullyConnectedPrimitive(cldnn::topology& topology, Inference
IE_ASSERT(weightPrimID.size() == 1);
IE_ASSERT(biasPrimID.size() <= 1);

auto outDims = layer->outData[0]->getTensorDesc().getDims().size();
auto fcPrim = cldnn::fully_connected(fcLayerName,
inputPrimitives[0],
weightPrimID[0],
biasPrimID.empty() ? "" : biasPrimID[0],
DataTypeFromPrecision(fcLayer->outData[0]->getTensorDesc().getPrecision()));
DataTypeFromPrecision(fcLayer->outData[0]->getTensorDesc().getPrecision()),
cldnn::padding(),
layer->outData[0]->getTensorDesc().getDims().size());

topology.add(fcPrim);
@@ -61,10 +61,12 @@ struct fully_connected : public primitive_base<fully_connected> {
const primitive_id& input,
const primitive_id& weights,
const primitive_id& bias = "",
const padding& output_padding = padding())
const padding& output_padding = padding(),
const size_t input_size = 2)
: primitive_base(id, {input}, output_padding),
weights(weights),
bias(bias)
bias(bias),
input_size(input_size)
{}

/// @brief Constructs fully connected layer.
@@ -77,16 +79,20 @@ struct fully_connected : public primitive_base<fully_connected> {
const primitive_id& weights,
const primitive_id& bias,
const data_types data_type,
const padding& output_padding = padding())
const padding& output_padding = padding(),
const size_t input_size = 2)
: primitive_base(id, { input }, output_padding, optional_data_type{data_type}),
weights(weights),
bias(bias)
bias(bias),
input_size(input_size)
{}

/// @brief Primitive id containing weights data.
primitive_id weights;
/// @brief Primitive id containing bias data.
primitive_id bias;
/// @brief Primitive dimension size.
size_t input_size;

protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
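As a usage sketch of the extended interface (the primitive ids, shapes and include paths below are illustrative assumptions, not part of this change), a caller opts into the 3D behaviour by passing the input rank through the new input_size argument:

#include <api/fully_connected.hpp>   // assumed include path for the clDNN C++ API
#include <api/topology.hpp>          // assumed include path

void add_3d_fc(cldnn::topology& topology) {
    // "input" is assumed to be a 3D tensor already present in the topology, e.g. bfyx {2, 3, 1, 128}.
    auto fc_3d = cldnn::fully_connected("fc3d",
                                        "input",
                                        "weights",
                                        "bias",                  // may be "" when there is no bias
                                        cldnn::data_types::f32,
                                        cldnn::padding(),
                                        3);                      // input_size = 3 selects the 3D path
    topology.add(fc_3d);
}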
@@ -52,6 +52,7 @@ ParamsKey FullyConnected_bf_tiled::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::bf);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bf);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableBatching();
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
@@ -68,11 +69,17 @@ bool FullyConnected_bf_tiled::Validate(const Params& params, const optional_para

auto& fc_params = static_cast<const fully_connected_params&>(params);
auto& input = fc_params.inputs[0];
auto& output = fc_params.output;

// Block reads must be aligned to 4 bytes, for fp16 we can correct for offset misalignment,
// but we need to ensure that batch pitch preserves alignment.
if (input.GetDType() == Datatype::F16 && input.Batch().pitch % 2 != 0 && input.Batch().v > 1)
if (input.GetDType() == Datatype::F16) {
if (input.Batch().pitch % 2 != 0 && input.Batch().v > 1)
return false;
// for 3d case we have to check feature alignment as well
if (output.GetLayout() == DataLayout::bfyx && input.Feature().pitch % 2 != 0 && input.Feature().v > 1)
return false;
}

if (input.GetLayout() == DataLayout::bfyx) {
// Padding on input is not supported.
@@ -83,6 +90,12 @@ bool FullyConnected_bf_tiled::Validate(const Params& params, const optional_para
return false;
}

// We don't support 4d output
if (fc_params.output.GetLayout() == DataLayout::bfyx) {
if (input.X().v > 1)
return false;
}

return true;
}

@@ -127,13 +140,20 @@ struct TuneParamsSelector {

bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params, const tune_params& tparams) {
// Check divisibility by dispatch tile sizes.
if (params.output.Batch().v % (tparams.tile_b * tparams.dispatch_bsv) != 0)
size_t output_f = params.output.Feature().v;
size_t output_b = params.output.Batch().v;
if (params.output.GetLayout() == DataLayout::bfyx) {
output_b *= params.output.Feature().v;
output_f = params.output.Y().v;
}

if (output_b % (tparams.tile_b * tparams.dispatch_bsv) != 0)
return false;
if (CeilDiv(params.output.Feature().v, tparams.tile_ofm * simd) % tparams.dispatch_fsv != 0)
if (CeilDiv(output_f, tparams.tile_ofm * simd) % tparams.dispatch_fsv != 0)
return false;

// Same result can be achieved with smaller tile_ofm.
if (params.output.Feature().v <= (tparams.tile_ofm / 2) * simd)
if (output_f <= (tparams.tile_ofm / 2) * simd)
return false;
// No weights layout for such huge tile ofm.
if (tparams.tile_ofm * simd > 64)
@@ -163,6 +183,12 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params,

size_t batch = params.output.Batch().v;
size_t output_f = params.output.Feature().v;

// 3d output
if (params.output.GetLayout() == DataLayout::bfyx) {
batch *= params.output.Feature().v;
output_f = params.output.Y().v;
}
Datatype dtype = params.inputs[0].GetDType();

auto selector = TuneParamsSelector(params);
@@ -219,6 +245,10 @@ FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int au

size_t feature_threads = CeilDiv(params.output.Feature().v, tparams.tile_ofm * simd);
size_t batch_threads = params.output.Batch().v / tparams.tile_b;
if (params.output.GetLayout() == DataLayout::bfyx) {
feature_threads = CeilDiv(params.output.Y().v, tparams.tile_ofm * simd);
batch_threads = (params.output.Batch().v * params.output.Feature().v) / tparams.tile_b;
}

dispatchData.gws[0] = feature_threads * batch_threads * simd;
dispatchData.gws[1] = 1;
@@ -252,7 +282,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para

jit.Merge(MakeConstantLoopUnrollJitConstants(dispatchData.tile_m));

bool realign_fp16_offset = params.inputs[0].GetDType() == Datatype::F16 && params.output.GetFirstElementOffset() % 2 != 0;
bool realign_fp16_offset = params.inputs[0].GetDType() == Datatype::F16 && params.inputs[0].GetFirstElementOffset() % 2 != 0;
jit.AddConstant(MakeJitConstant("REALIGN_FP16_OFFSET", realign_fp16_offset));

auto activation_dt = GetActivationType(params);
@@ -260,13 +290,32 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
jit.Merge(MakeActivationJitConstants(params.activations, activation_dt, "_TYPED"));

// for 3d output we are treating spatial as features
if (params.output.GetLayout() == DataLayout::bfyx) {
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.output.Y().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.output.Y().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.output.Feature().pitch));
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
}
else {
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.output.Feature().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.output.Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.output.Batch().pitch));
}

size_t output_f = params.output.GetLayout() == DataLayout::bfyx ? params.output.Y().v : params.output.Feature().v;
if (!params.fused_ops.empty()) {
auto boundary_check = BoundaryCheck::DISABLED;
if (params.output.Feature().v % (dispatchData.tile_n * simd) != 0)
if (output_f % (dispatchData.tile_n * simd) != 0)
boundary_check = BoundaryCheck::ENABLED;

std::vector<std::string> idx_order = {"(out_b + bi)", "out_f", "0", "0"};
if (params.output.GetLayout() == DataLayout::bfyx)
idx_order = {"(out_b + bi) % OUTPUT_BATCH_NUM", "(out_b + bi) / OUTPUT_BATCH_NUM", "out_f", "0"};
FusedOpsConfiguration conf = { "",
{"(out_b + bi)", "out_f", "0", "0"},
idx_order,
"activated[bi]",
activation_dt,
dispatchData.tile_n,
@@ -284,6 +333,9 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
const optional_params &options,
const int autoTuneIndex) const {
auto& fc_params = static_cast<const fully_connected_params&>(params);
size_t output_b = fc_params.output.Batch().v;
if (fc_params.output.GetLayout() == DataLayout::bfyx)
output_b *= fc_params.output.Feature().v;

if (autoTuneIndex >= 0 && autoTuneIndex < (int)auto_tune_params.size()
&& !TuneParamsSelector::VerifyTuneParams(fc_params, auto_tune_params[autoTuneIndex]))
@@ -298,9 +350,9 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
weights_layout = WeightsLayout::os_iyx_osv64;

float estimated_time = DONT_USE_IF_HAVE_SOMETHING_ELSE;
if (fc_params.output.Batch().v > 1 && fc_params.inputs[0].GetDType() == Datatype::F32)
if (output_b > 1 && fc_params.inputs[0].GetDType() == Datatype::F32)
estimated_time = FORCE_PRIORITY_3;
if (fc_params.output.Batch().v > 1 && fc_params.inputs[0].GetDType() == Datatype::F16)
if (output_b > 1 && fc_params.inputs[0].GetDType() == Datatype::F16)
estimated_time = FORCE_PRIORITY_4;

return GetCommonKernelsData(params,
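To summarize the bfyx (3D output) branch threaded through this kernel: batch and feature are merged into the kernel's batch axis, while spatial Y plays the role of the output feature axis. A brief illustration (the numbers are made up, not from the patch):

// 2D case: kernel "batch" = output.Batch().v,                        kernel "features" = output.Feature().v
// 3D case: kernel "batch" = output.Batch().v * output.Feature().v,   kernel "features" = output.Y().v
// e.g. a [2, 3, 64] output is dispatched as 2 * 3 = 6 batch rows and 64 output features,
//      so feature_threads = CeilDiv(64, tile_ofm * simd) and batch_threads = 6 / tile_b.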
@@ -34,8 +34,10 @@ ParamsKey FullyConnected_bfyx_Ref::GetSupportedKey() const {
k.EnableDifferentInputWeightsTypes();
k.EnableDifferentTypes();
k.EnableInputLayout(DataLayout::bf);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bf);
k.EnableOutputLayout(DataLayout::fb);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableBiasPerOutput();
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
@@ -50,7 +52,11 @@ FullyConnected_bfyx_Ref::DispatchData FullyConnected_bfyx_Ref::SetDefault(const
int) const {
auto dispatchData = Parent::SetDefault(params);

dispatchData.gws = { params.output.Feature().v, params.output.Batch().v, 1 };
std::vector<size_t> global = {params.output.Feature().v, params.output.Batch().v, 1};
if (params.output.GetLayout() == DataLayout::bfyx)
global = {params.output.Feature().v * params.output.Y().v, params.output.Batch().v, 1};

dispatchData.gws = global;
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);

return dispatchData;
@@ -69,13 +75,14 @@ JitConstants FullyConnected_bfyx_Ref::GetJitConstants(const fully_connected_para
accumulator_dt = Datatype::F32;
activation_dt = Datatype::F32;
}

if (params.output.GetLayout() == DataLayout::bfyx)
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(accumulator_dt, "ACCUMULATOR"));
jit.Merge(MakeActivationJitConstants(params.activations, activation_dt, "_TYPED"));

if (!params.fused_ops.empty()) {
FusedOpsConfiguration conf = { "", {"b", "ofm", "y", "x"}, "dequantized", activation_dt, 1 };
FusedOpsConfiguration conf = { "", {"b", "ofm", "oym", "0"}, "dequantized", activation_dt, 1 };
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
}
return jit;
@@ -126,6 +133,10 @@ bool FullyConnected_bfyx_Ref::Validate(const Params& params, const optional_para
if (!is_quantization && !has_fused_op)
return false;

// We don't support 4d output
if (fc_params.output.GetLayout() == DataLayout::bfyx && fc_params.output.X().v > 1)
return false;

return true;
}
@@ -66,9 +66,21 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get

const auto& input = params.inputs[0];
const auto& output = params.output;
size_t input_feature = input.Feature().v;
size_t input_batch = input.Batch().v;
size_t output_feature = output.Feature().v;
size_t output_batch = output.Batch().v;
// for 3d case
if (output.GetLayout() == DataLayout::bfyx) {
input_batch *= input.Feature().v;
input_feature = input.Y().v;
output_batch *= output.Feature().v;
output_feature = output.Y().v;
}

tuning_data.sub_group_size = 8;
if (input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1 && input.Batch().v == 1) {
if (input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
((input.Y().v == 1 && output.GetLayout() != DataLayout::bfyx) || (input.Feature().v == 1 && output.GetLayout() == DataLayout::bfyx)) ) {
// Known cases for TGL where simd16 works better than simd8
bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;
bool simd16_exception_2 = input.Feature().v == 21504 && output.Feature().v == 512;
@@ -79,14 +91,14 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get

size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;

tuning_data.feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size != 0 ?
input.Feature().v / sub_group_pack_size :
tuning_data.feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input_feature % sub_group_pack_size != 0 ?
input_feature / sub_group_pack_size :
input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 ?
CeilDiv(input.Feature().v, 32) % 2 == 0 ? CeilDiv(input.Feature().v, 64) : CeilDiv(input.Feature().v, 64) - 1 :
CeilDiv(input.Feature().v, sub_group_pack_size);
CeilDiv(input_feature, 32) % 2 == 0 ? CeilDiv(input_feature, 64) : CeilDiv(input_feature, 64) - 1 :
CeilDiv(input_feature, sub_group_pack_size);

bool slm_div_factor_exception = input.Batch().v == 300 && input.Feature().v == 2048 &&
output.Batch().v == 300 && (output.Feature().v == 324 || output.Feature().v == 81);
bool slm_div_factor_exception = input_batch == 300 && input_feature == 2048 &&
output_batch == 300 && (output_feature == 324 || output_feature == 81);

if (tuning_data.feature_blocks_count && tuning_data.sub_group_size == 8 && !slm_div_factor_exception)
while (tuning_data.feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
@@ -120,7 +132,11 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons
auto dispatchData = Parent::SetDefault(params);
const auto& output = params.output;

dispatchData.gws = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
std::vector<size_t> global = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
if (output.GetLayout() == DataLayout::bfyx)
global = { Align(output.Y().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, output.Feature().v };

dispatchData.gws = global;
dispatchData.lws = { tuning_data.work_group_size, 1, 1 };

return dispatchData;
@@ -133,6 +149,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
auto jit = Parent::GetJitConstants(params, runInfo);

auto& input = params.inputs[0];
auto& output = params.output;
auto& weights = params.weights;

size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;
@@ -181,6 +198,9 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
bool has_feature_leftovers = (input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size) ||
(input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 && CeilDiv(input.Feature().v, 32) % 2);

if (output.GetLayout() == DataLayout::bfyx)
has_feature_leftovers = input.Y().v % sub_group_pack_size;

jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", has_feature_leftovers));
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", tuning_data.feature_blocks_count));
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
@@ -200,9 +220,24 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
jit.AddConstant(MakeJitConstant("SPLIT_SPATIAL", split_spatial));
jit.AddConstant(MakeJitConstant("SPATIAL_MAJOR", spatial_major));

if (output.GetLayout() == DataLayout::bfyx) {
jit.AddConstant(MakeJitConstant("FEATURE_PITCH", input.Y().pitch));
jit.AddConstant(MakeJitConstant("OUT_FEATURE_NUM", output.Y().v));
jit.AddConstant(MakeJitConstant("IN_FEATURE_NUM", input.Y().v));
jit.AddConstant(MakeJitConstant("IS_3D", true));
} else {
jit.AddConstant(MakeJitConstant("FEATURE_PITCH", input.Feature().pitch));
jit.AddConstant(MakeJitConstant("OUT_FEATURE_NUM", output.Feature().v));
jit.AddConstant(MakeJitConstant("IN_FEATURE_NUM", input.Feature().v));
}

if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf = { "", {"batch", "feature", "0", "0"}, "dequantized", input_dt, 1 };
std::vector<std::string> idx_order = {"batch", "feature", "0", "0"};
if (output.GetLayout() == DataLayout::bfyx)
idx_order = {"batch", "skip_f", "feature", "0"};

FusedOpsConfiguration conf = { "", idx_order, "dequantized", input_dt, 1 };
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
}
@@ -49,6 +49,7 @@ KERNEL(fully_connected_gpu_MMAD)(
const uint feature = (uint)get_group_id(0) * feature_per_wg + (uint)get_global_id(0) % feature_per_wg;
const uint feature_block = lid0 / feature_per_wg;
const uint batch = (uint)get_global_id(1);
const uint skip_f = (uint)get_global_id(2);

int dotProd = 0;

@@ -56,7 +57,7 @@ KERNEL(fully_connected_gpu_MMAD)(
#if INPUT0_DIMS == 5
const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0, 0);
#else
const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0);
const uint input_offset = INPUT0_GET_INDEX(batch, skip_f, 0, 0);
#endif

#if SLM_DIV_FACTOR > 1
@@ -221,17 +222,17 @@ KERNEL(fully_connected_gpu_MMAD)(
#endif // SPATIAL_MAJOR

#if !SPLIT_SPATIAL
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * FEATURE_PITCH;
#else
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH +
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * FEATURE_PITCH +
zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
#endif // !SPLIT_SPATIAL
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;

MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
for (uint i = 0; i < 4; i++) {
if (FEATURE_BLOCKS_COUNT * SUB_GROUP_SIZE * 4 + sglid * 4 + i < INPUT0_FEATURE_NUM) {
input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH];
if (FEATURE_BLOCKS_COUNT * SUB_GROUP_SIZE * 4 + sglid * 4 + i < IN_FEATURE_NUM) {
input_data_u[i] = input[input_idx + (sglid * 4 + i) * FEATURE_PITCH];
}
}
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
@@ -269,7 +270,7 @@ KERNEL(fully_connected_gpu_MMAD)(
}
#endif // HAS_FEATURE_LEFTOVERS

if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && feature >= OUTPUT_FEATURE_NUM)
if (OUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && feature >= OUT_FEATURE_NUM)
return;

#if BIAS_TERM
@@ -284,7 +285,11 @@ KERNEL(fully_connected_gpu_MMAD)(
float dequantized = (float)dotProd;
#endif // BIAS_TERM

#if IS_3D
const uint out_idx = OUTPUT_GET_INDEX(batch, skip_f, feature, 0);
#else
const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0);
#endif

#if HAS_FUSED_OPS
FUSED_OPS;
@@ -67,13 +67,27 @@

#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Check alignment restrictions for using block writes on output.
#define USE_BLOCK_WRITE ((OUTPUT_TYPE_SIZE * OUTPUT_BATCH_PITCH) % 16 == 0 && (OUTPUT_TYPE_SIZE * OUTPUT_OFFSET) % 16 == 0)
#define USE_BLOCK_WRITE ((OUTPUT_TYPE_SIZE * TILE_OUT_B_PITCH) % 16 == 0 && (OUTPUT_TYPE_SIZE * OUTPUT_OFFSET) % 16 == 0)

#if !REALIGN_FP16_OFFSET
# if OUTPUT_3D
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y
# else
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
# endif
#else
// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
# if OUTPUT_3D
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1)
# else
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1)
# endif
#endif

#if OUTPUT_3D
# define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y
#else
# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
#endif

__attribute__((intel_reqd_sub_group_size(SIMD)))
@@ -97,25 +111,24 @@ KERNEL(fc)(
// full dispatch pipeline.
uint feature_mini_block = gid % DISPATCH_FSV;
uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV;
uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(OUTPUT_FEATURE_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);
uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(OUTPUT_FEATURE_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);
uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);
uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);

uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD);
uint out_b = (batch_mega_block * DISPATCH_BSV + batch_mini_block) * TILE_B;
uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * TILE_B);

ACCUMULATOR_VEC_TYPE acc[TILE_B] = { };
INPUT_VEC_TYPE in_0[TILE_B] = { };

FILTER_VEC_TYPE wei = 0;

uint weights_offset = out_f * INPUT0_ELEMENTS_COUNT;
uint input_offset = out_b * INPUT0_BATCH_PITCH + INPUT0_OFFSET;
uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
uint weights_offset = out_f * INPUT_ELEMENTS_COUNT;

#if REALIGN_FP16_OFFSET
// For fp16 we need to ensure that all block reads are aligned to 4 byte (2 words) boundary.
// To do this solve first input feature separately.
{
INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % TILE_B * INPUT0_BATCH_PITCH];
INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % TILE_B * TILE_IN_B_PITCH];
MAKE_VECTOR_TYPE(FILTER_TYPE, TILE_OFM) tmp_wei = BLOCK_READN(FILTER_TYPE, TILE_OFM, weights, weights_offset);

__attribute__((opencl_unroll_hint))
@@ -135,13 +148,12 @@ KERNEL(fc)(
// Load input.
#define LOAD_IN_0(bi) do { \
in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \
input_offset += INPUT0_BATCH_PITCH; \
input_offset += TILE_IN_B_PITCH; \
} while (false)

CONST_LOOP(TILE_B, LOAD_IN_0);
#undef LOAD_IN_0
input_offset += TILE_IFM * SIMD - INPUT0_BATCH_PITCH * TILE_B;

input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B;
// NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
// but significantly degrades readability and generality of code.
// It also doesn't show noticeable performance improvement on tested configurations.
@@ -172,13 +184,12 @@ KERNEL(fc)(
{
#define LOAD_IN_0(bi) do { \
in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \
input_offset += INPUT0_BATCH_PITCH; \
input_offset += TILE_IN_B_PITCH; \
} while (false)

CONST_LOOP(TILE_B, LOAD_IN_0);
#undef LOAD_IN_0
input_offset += TILE_IFM * SIMD - INPUT0_BATCH_PITCH * TILE_B;

input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B;
__attribute__((opencl_unroll_hint))
for (uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
wei = FILTER_BLOCK_READ(weights, weights_offset);
@@ -210,7 +221,7 @@ KERNEL(fc)(
}

#if BIAS_TERM
#if OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0
#if TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0
BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f);
#else
BIAS_VEC_TYPE bias = 0;
@@ -242,12 +253,12 @@ KERNEL(fc)(
#endif
// =====================================================================================================================================
// Write results
uint output_offset = out_f * OUTPUT_FEATURE_PITCH + out_b * OUTPUT_BATCH_PITCH + OUTPUT_OFFSET;
uint output_offset = out_f * TILE_OUT_F_PITCH + out_b * TILE_OUT_B_PITCH + OUTPUT_OFFSET;

if (USE_BLOCK_WRITE && (OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0 || out_f + (TILE_OFM * SIMD) <= OUTPUT_FEATURE_NUM)) {
if (USE_BLOCK_WRITE && (TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || out_f + (TILE_OFM * SIMD) <= TILE_OUT_F_NUM)) {
#define WRITE_OUTPUT(bi) do { \
OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \
output_offset += OUTPUT_BATCH_PITCH; \
output_offset += TILE_OUT_B_PITCH; \
} while (false)

CONST_LOOP(TILE_B, WRITE_OUTPUT);
@@ -258,8 +269,8 @@ KERNEL(fc)(
// TODO: Investigate why below code doesn't compile and check how it affects performance.
//#define WRITE_OUTPUT_FEATURE(fi) do { \
// const bool should_write = \
// OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0 || \
// out_f + (fi) * SIMD + get_sub_group_local_id() < OUTPUT_FEATURE_NUM; \
// TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || \
// out_f + (fi) * SIMD + get_sub_group_local_id() < TILE_OUT_F_NUM; \
// if (should_write) { \
// output[output_offset] = result[out_bi][fi]; \
// } \
@@ -269,7 +280,7 @@ KERNEL(fc)(
//#define WRITE_OUTPUT(bi) do { \
// const uint out_bi = bi; \
// CONST_LOOP(TILE_OFM, WRITE_OUTPUT_FEATURE); \
// output_offset += OUTPUT_BATCH_PITCH - TILE_OFM * SIMD; \
// output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; \
// } while (false)
//
//CONST_LOOP(TILE_B, WRITE_OUTPUT);
@@ -279,14 +290,14 @@ KERNEL(fc)(
for (uint bi = 0; bi < TILE_B; ++bi) {
for (uint fi = 0; fi < TILE_OFM; ++fi) {
const bool should_write =
OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0 ||
out_f + fi * SIMD + get_sub_group_local_id() < OUTPUT_FEATURE_NUM;
TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 ||
out_f + fi * SIMD + get_sub_group_local_id() < TILE_OUT_F_NUM;
if (should_write) {
output[output_offset] = ((OUTPUT_TYPE*)(&result[bi]))[fi];
}
output_offset += SIMD;
}
output_offset += OUTPUT_BATCH_PITCH - TILE_OFM * SIMD;
output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD;
}
}
// =====================================================================================================================================
@@ -27,10 +27,31 @@ KERNEL(fc)(
#endif
)
{
#if OUTPUT_3D
const uint oxfm = get_global_id(0);
const uint b = get_global_id(1);
const uint oym = oxfm % OUTPUT_SIZE_Y;
const uint ofm = oxfm / OUTPUT_SIZE_Y;

ACCUMULATOR_TYPE dotProd = ACCUMULATOR_VAL_ZERO;

for (uint y = 0; y < INPUT0_SIZE_Y; ++y)
{
for(uint x = 0; x < INPUT0_SIZE_X; ++x )
{
const uint input0_idx = GET_DATA_INDEX(INPUT0, b, ofm, y, x);
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
dotProd += (ACCUMULATOR_TYPE)(input[input0_idx] * weights[filter_idx]);
}
}

const uint dst_index = GET_DATA_INDEX(OUTPUT, b, ofm, oym, 0);
const uint bias_index = oym;
#else
const uint ofm = get_global_id(0);
const uint b = get_global_id(1);

ACCUMULATOR_TYPE dotProd = (ACCUMULATOR_TYPE)0;
ACCUMULATOR_TYPE dotProd = ACCUMULATOR_VAL_ZERO;

for (uint ifm = 0; ifm < INPUT0_FEATURE_NUM; ++ifm)
{
@@ -46,9 +67,10 @@ KERNEL(fc)(
}

const uint dst_index = GET_DATA_INDEX(OUTPUT, b, ofm, 0, 0);
const uint bias_index = ofm;
#endif

#if BIAS_TERM
const uint bias_index = ofm;
ACTIVATION_TYPE dequantized = dotProd + biases[bias_index];
#else
ACTIVATION_TYPE dequantized = dotProd;
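A small worked example of the OUTPUT_3D index decomposition above (the values are illustrative, not from the patch):

// Assuming OUTPUT_SIZE_Y == 64 (output neurons) and get_global_id(0) == 130:
//   oym = 130 % 64 = 2   -> output neuron within the row
//   ofm = 130 / 64 = 2   -> which merged (batch, feature) row is being computed
// This matches the bfyx dispatch in FullyConnected_bfyx_Ref::SetDefault, where gws[0] is
// output.Feature().v * output.Y().v, so every (row, neuron) pair gets one work item.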
@@ -54,6 +54,10 @@ bool is_batch_after_spatial(const std::string order) {
format::type get_preferred_format(const fully_connected_node& node) {
auto input_layout = node.input().get_output_layout();

// for 3d output we have to choose bfyx format
if (node.get_primitive()->input_size == 3)
return format::bfyx;

if (data_type_traits::is_floating_point(input_layout.data_type) &&
(is_batch_after_spatial(input_layout.format.order()) ||
input_layout.format == format::bs_x_bsv16 ||
@@ -107,6 +111,9 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
}

auto output_size = tensor(input_layout.size.batch[0], weights_layout.size.batch[0], 1, 1);
if (desc->input_size == 3) {
output_size = tensor(input_layout.size.batch[0], input_layout.size.feature[0], 1, weights_layout.size.batch[0]);
}
format output_format = get_preferred_format(node);

return layout(output_type, output_format, output_size);
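A worked example of the 3D branch of calc_output_layout (the shapes are illustrative, not from the patch):

// input layout  : bfyx { batch = 2, feature = 3, x = 1, y = 128 }   // a [2, 3, 128] tensor
// weights layout: { batch = 64, ... }                               // 64 output neurons
// desc->input_size == 3
// output_size = tensor(2 /*b*/, 3 /*f*/, 1 /*x*/, 64 /*y*/)         // i.e. a [2, 3, 64] result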
@@ -57,10 +57,11 @@ public:
arg.get_program());
fc_optional_params.allowInputReordering = true;

fc_params.output = fc_params.output.FlattenFeatureAndSpatials();

const auto primitive = arg.get_primitive();

if (primitive->input_size != 3)
fc_params.output = fc_params.output.FlattenFeatureAndSpatials();

if (arg.get_output_layout().data_type == data_types::i8 ||
arg.get_output_layout().data_type == data_types::u8) {
fc_params.quantization = kernel_selector::QuantizationType::SYMMETRIC;
@@ -1150,6 +1150,109 @@ INSTANTIATE_TEST_CASE_P(smoke_bfyx_batched,
::testing::Values(format::bfyx),
::testing::Values("")), );


template <typename InputT, typename WeightsT, typename BiasT, typename OutputT>
struct fully_connected_random_test_3d : ::testing::TestWithParam<fully_connected_test_params> {
void run_test() {
size_t batch, input_f, input_x, input_y, output_y;
format::type input_format, output_format;
std::string kernel;

std::tie(batch, input_f, input_x, input_y, output_y, input_format, output_format, kernel) = GetParam();

auto input_data = generate_smart_random_4d<InputT>(batch, input_f, input_y, input_x);
auto weights_data = generate_smart_random_4d<WeightsT>(output_y, input_y, 1, 1);
auto bias_data = generate_smart_random_2d<BiasT>(1, output_y);

auto eng = get_test_engine();
auto net = network_test(eng);
auto input = net.add_input_layout<InputT, 4>("input", input_format, std::move(input_data));
auto weights = net.add_data<WeightsT, 4>("weights", format::oiyx, std::move(weights_data));
auto bias = net.add_data<BiasT, 2>("bias", format::bfyx, std::move(bias_data));
auto fc = net.add_fully_connected_3d<OutputT>("fc", input, weights, bias, implementation_desc{ output_format, kernel }, 3);

net.run(build_options(build_option::optimize_data(true)));
}
};


using fully_connected_random_test_f32_3d = fully_connected_random_test_3d<float, float, float, float>;
using fully_connected_random_test_f16_3d = fully_connected_random_test_3d<FLOAT16, FLOAT16, FLOAT16, FLOAT16>;
using fully_connected_random_test_i8_3d = fully_connected_random_test_3d<int8_t, int8_t, int8_t, float>;

TEST_P(fully_connected_random_test_f32_3d, basic) {
run_test();
}

INSTANTIATE_TEST_CASE_P(smoke,
fully_connected_random_test_f32_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(1,3),
::testing::Values(1),
::testing::Values(1,3,16),
::testing::Values(1,3,16),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

INSTANTIATE_TEST_CASE_P(smoke_big,
fully_connected_random_test_f32_3d,
::testing::Combine(
::testing::Values(3),
::testing::Values(16, 17, 32),
::testing::Values(1),
::testing::Values(17, 32),
::testing::Values(17, 32),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

TEST_P(fully_connected_random_test_f16_3d, basic) {
run_test();
}

INSTANTIATE_TEST_CASE_P(smoke,
fully_connected_random_test_f16_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(1,3),
::testing::Values(1),
::testing::Values(1,3,16),
::testing::Values(1,3,16),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

TEST_P(fully_connected_random_test_i8_3d, basic) {
run_test();
}

INSTANTIATE_TEST_CASE_P(smoke,
fully_connected_random_test_i8_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(1,3),
::testing::Values(1),
::testing::Values(1,3,16),
::testing::Values(1,3,16),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

INSTANTIATE_TEST_CASE_P(smoke_big,
fully_connected_random_test_i8_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(16,17),
::testing::Values(1),
::testing::Values(17, 32),
::testing::Values(17, 32),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );


struct quantization_t {
VF<float> input_low;
VF<float> input_high;
@@ -368,6 +368,38 @@ public:
layout get_per_channel_layout(T& p) {
return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
}

size_t get_fc_output_dim_size(bc_test_params& p) {
size_t size = 2;
for (auto i : p.out_shape.spatial) {
if (i > 1)
size++;
}
return size;
}

layout get_fc_weights_layout(T& p) {
cldnn::tensor weights_tensor;
if (p.out_shape.spatial[1] > 1) {
// 3d case
weights_tensor = cldnn::tensor(p.kernel.batch[0], p.kernel.feature[0], 1, 1);
}
else {
weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.feature[0]),
spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2]));
}
return layout{p.weights_type, p.weights_format, weights_tensor};
}

layout get_fc_bias_layout(T& p) {
if (p.out_shape.spatial[1] > 1) {
// 3d case
return layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.spatial[1], 1, 1} };
}
else {
return layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.feature[0], 1, 1} };
}
}
};

class ResamplePrimitiveFusingTest : public ::BaseFusingTest<resample_test_params> {
@@ -557,10 +589,16 @@ public:
#define CASE_FC_FP32_1 {1, 1, 3, 1}, {1, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP32_2 {2, 1, 3, 1}, {2, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::yxfb, data_types::f32, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP32_3 {2, 32, 1, 1}, {2, 16, 1, 1}, {16, 32, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP32_3D_1 {5, 3, 1, 3}, {5, 3, 1, 5}, {5, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
#define CASE_FC_FP32_3D_2 {2, 1, 1, 1}, {2, 1, 1, 32}, {32, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
#define CASE_FC_FP32_3D_3 {2, 32, 1, 32}, {2, 32, 1, 16}, {16, 32, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx

#define CASE_FC_U8S8_1 {1, 1, 3, 1}, {1, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_2 {2, 1, 3, 1}, {2, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3 {2, 32, 1, 1}, {2, 16, 1, 1}, {16, 32, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_1 {2, 32, 1, 3}, {2, 32, 1, 16}, {16, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_2 {1, 1, 1, 3}, {1, 1, 1, 32}, {32, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_3 {2, 3, 1, 1}, {2, 3, 1, 15}, {15, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx

#define CASE_NORMALIZE_I8_1 {1, 2, 3, 3}, data_types::u8, format::bfyx, data_types::f32, format::bfyx

@@ -2475,9 +2513,9 @@ class fc_fp32_activation : public FCFusingTest {};
TEST_P(fc_fp32_activation, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
fully_connected("fc_prim", "input", "weights", "bias"),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
fully_connected("fc_prim", "input", "weights", "bias", padding(), get_fc_output_dim_size(p)),
activation("activation", "fc_prim", activation_func::abs),
reorder("reorder_bfyx", "activation", p.default_format, data_types::f32)
);
@@ -2490,14 +2528,17 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_fp32_activation, ::testing::ValuesIn(std
bc_test_params{ CASE_FC_FP32_1, 2, 3 },
bc_test_params{ CASE_FC_FP32_2, 2, 3 },
bc_test_params{ CASE_FC_FP32_3, 2, 3 },
bc_test_params{ CASE_FC_FP32_3D_1, 2, 3 },
bc_test_params{ CASE_FC_FP32_3D_2, 2, 3 },
bc_test_params{ CASE_FC_FP32_3D_3, 2, 3 },
}), );

class fc_fp32_bias : public FCFusingTest {};
TEST_P(fc_fp32_bias, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
fully_connected("fc_prim", "input", "weights", ""),
eltwise("bias_add", {"fc_prim", "bias"}, eltwise_mode::sum),
reorder("reorder_bfyx", "bias_add", p.default_format, data_types::f32)
@@ -2517,10 +2558,10 @@ class fc_int8_scale : public FCFusingTest {};
TEST_P(fc_int8_scale, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data"),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
@@ -2532,10 +2573,10 @@ TEST_P(fc_int8_scale, basic) {
TEST_P(fc_int8_scale, fp16_scale_out) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data", optional_data_type{data_types::f16}),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
@@ -2549,19 +2590,22 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale,
bc_test_params{ CASE_FC_U8S8_1, 2, 3 },
bc_test_params{ CASE_FC_U8S8_2, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_1, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 3 },
}), );

class fc_int8_quantize_u8 : public FCFusingTest {};
TEST_P(fc_int8_quantize_u8, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 255)),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
quantize("quantize", "fc_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
@@ -2575,20 +2619,23 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu_fc, fc_int8_quantize_u8,
bc_test_params{CASE_FC_U8S8_1, 2, 3},
bc_test_params{CASE_FC_U8S8_2, 2, 3},
bc_test_params{CASE_FC_U8S8_3, 2, 3},
bc_test_params{ CASE_FC_U8S8_3D_1, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 3 },
}), );

class fc_int8_scale_quantize_i8 : public FCFusingTest {};
TEST_P(fc_int8_scale_quantize_i8, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data"),
quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
@@ -2602,6 +2649,9 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale_quantize_i8,
bc_test_params{CASE_FC_U8S8_1, 2, 4},
bc_test_params{CASE_FC_U8S8_2, 2, 4},
bc_test_params{CASE_FC_U8S8_3, 2, 4},
bc_test_params{ CASE_FC_U8S8_3D_1, 2, 4 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 4 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 4 },
}), );


@@ -2610,14 +2660,14 @@ class fc_int8_scale_activation_quantize_i8 : public FCFusingTest {};
TEST_P(fc_int8_scale_activation_quantize_i8, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data"),
activation("activation_scale", "scale", activation_func::exp),
quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
@@ -2633,6 +2683,14 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale_activation_quantize_i8,
bc_test_params{CASE_FC_U8S8_1, 2, 5},
bc_test_params{CASE_FC_U8S8_2, 2, 5},
bc_test_params{CASE_FC_U8S8_3, 2, 5},

bc_test_params{ CASE_FC_U8S8_3D_1, 2, 5 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 5 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 5 },

bc_test_params{ CASE_FC_FP32_3D_1, 3, 5 },
bc_test_params{ CASE_FC_FP32_3D_2, 3, 5 },
bc_test_params{ CASE_FC_FP32_3D_3, 3, 5 },
}), );
@@ -144,7 +144,6 @@ template <typename T>
struct reference_tensor_typed<T, 4> : reference_tensor {
using vector_type = VVVVF<T>;
reference_tensor_typed(vector_type data) : reference(std::move(data)) {}

void compare(cldnn::memory actual) override {
auto ptr = actual.pointer<T>();
for (size_t bi = 0; bi < reference.size(); ++bi) {
@@ -231,6 +230,36 @@ VVF<OutputT> fully_connected_reference_typed(VVVVF<InputT>& input, VVVVF<Weights
return output;
}

template <typename OutputT,
typename InputT,
typename WeightsT,
typename BiasT,
typename AccT = typename fully_connected_accumulator<InputT>::type>
VVVVF<OutputT> fully_connected_reference_typed_3d(VVVVF<InputT>& input, VVVVF<WeightsT>& weights, VF<BiasT>& bias) {
size_t input_f = input[0].size();
size_t input_y = input[0][0].size();
size_t input_x = input[0][0][0].size();
size_t output_b = input.size();       // input is assumed to be bfyx
size_t output_f = weights.size();     // weights is assumed to be bfyx
size_t weights_f = weights[0].size(); // weights is assumed to be bfyx
VVVVF<OutputT> output(output_b, VVVF<OutputT>(input_f, VVF<OutputT>(output_f, VF<OutputT>(1))));
OutputT res;
for (size_t b = 0; b < output_b; ++b) {
for (size_t n = 0; n < input_f; ++n) {
for (size_t f = 0; f < output_f; ++f) {
res = bias[f];
for (size_t y = 0; y < input_y; ++y) {
for (size_t x = 0; x < input_x; ++x) {
res += (OutputT)input[b][n][y][x] * (OutputT)weights[f][y][0][0];
}
}
output[b][n][f][0] = (OutputT)res;
}
}
}
return output;
}
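Spelled out, the reference helper above computes an ordinary batched matrix product plus bias, one row per input feature slice (notation only, not code from the patch):

// output[b][n][f] = bias[f] + sum over y, x of input[b][n][y][x] * weights[f][y][0][0]
// i.e. for each batch b and input-feature row n, a dot product of that row with the f-th
// weight vector over the Y (and X) axes, matching the [output_y, input_y, 1, 1] weights
// generated by the 3D tests.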
// =====================================================================================================================
// Network test
struct reference_node_interface {
@@ -300,6 +329,22 @@ public:
return add_node(id, reference_tensor_typed<T, 2>(output_data), { input, weights, bias });
}

template <typename T, typename InputT, size_t InputN, typename WeightsT, typename BiasT>
typename reference_node<T, 4>::ptr add_fully_connected_3d(cldnn::primitive_id id,
std::shared_ptr<reference_node<InputT, InputN>> input,
std::shared_ptr<reference_node<WeightsT, InputN>> weights,
std::shared_ptr<reference_node<BiasT, 2>> bias,
cldnn::implementation_desc force = cldnn::implementation_desc{cldnn::format::any, ""},
size_t input_dim_size = 3) {
topo.add(cldnn::fully_connected(id, input->id, weights->id, bias->id, cldnn::type_to_data_type<T>::value, cldnn::padding(), input_dim_size));
if (force.output_format != cldnn::format::any || force.kernel_name != "")
forced_impls[id] = force;
VVVVF<T> output_data = fully_connected_reference_typed_3d<T>(input->reference.reference,
weights->reference.reference,
bias->reference.reference[0]);
return add_node(id, reference_tensor_typed<T, 4>(output_data), {input, weights, bias});
}

cldnn::network build_network(cldnn::build_options opts) {
opts.set_option(cldnn::build_option::force_implementations(forced_impls));
auto net = cldnn::network(eng, topo, opts);