[IE CLDNN] Fully connected MMAD simd16 improvements (#3394)
This commit is contained in:
parent
7fe21dc6ee
commit
b6f311b463
@ -413,6 +413,9 @@ struct layout {
|
||||
if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
|
||||
sizes[0] = align_to(sizes[0], 8);
|
||||
sizes[1] = align_to(sizes[1], 32);
|
||||
} else if (this->format == cldnn::format::os_is_yx_isa8_osv16_isv4 && !(is_aligned_to(sizes[0], 16)) && !(is_aligned_to(sizes[1], 32))) {
|
||||
sizes[0] = align_to(sizes[0], 16);
|
||||
sizes[1] = align_to(sizes[1], 32);
|
||||
} else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) {
|
||||
sizes[0] = align_to(sizes[0], 32);
|
||||
sizes[1] = align_to(sizes[1], 32);
|
||||
|
11
inference-engine/thirdparty/clDNN/api/tensor.hpp
vendored
11
inference-engine/thirdparty/clDNN/api/tensor.hpp
vendored
@ -162,6 +162,8 @@ struct format {
|
||||
///< convolution, F(6,3) -- filter 3x3 with stride 1
|
||||
os_is_yx_isa8_osv8_isv4, ///< format for weights for MMAD convolution
|
||||
os_is_zyx_isa8_osv8_isv4, ///< format for weights for MMAD convolution
|
||||
os_is_yx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD
|
||||
os_is_zyx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD
|
||||
os_is_yx_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD convolution
|
||||
os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution
|
||||
os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution
|
||||
@ -273,8 +275,10 @@ struct format {
|
||||
{ image_2d_weights_c1_b_fyx, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
|
||||
{ lstm_weights_dio, { 1, 1, 2, 0, 0, "oixy", "oixy?", {}}},
|
||||
{ os_is_yx_isa8_osv8_isv4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
|
||||
{ os_is_yx_isa8_osv16_isv4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
|
||||
{ os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
|
||||
{ os_is_zyx_isa8_osv8_isv4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}}},
|
||||
{ os_is_zyx_isa8_osv16_isv4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}}},
|
||||
{ os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {{0, 32}, {1, 32}}}},
|
||||
{ os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{0, 32}, {1, 32}}}},
|
||||
{ is_o_yx_isv32, { 1, 1, 2, 0, 0, "oyxi", "oixy?", {{1, 32}}}},
|
||||
@ -995,6 +999,13 @@ public:
|
||||
my_sizes[1] = align_to(my_sizes[1], 32);
|
||||
adjusted_coords[0] = align_to(adjusted_coords[0], 8);
|
||||
adjusted_coords[1] = align_to(adjusted_coords[1], 32);
|
||||
} else if (fmt == cldnn::format::os_is_yx_isa8_osv16_isv4 &&
|
||||
!(is_aligned_to(my_sizes[0], 16)) &&
|
||||
!(is_aligned_to(my_sizes[1], 32))) {
|
||||
my_sizes[0] = align_to(my_sizes[0], 16);
|
||||
my_sizes[1] = align_to(my_sizes[1], 32);
|
||||
adjusted_coords[0] = align_to(adjusted_coords[0], 16);
|
||||
adjusted_coords[1] = align_to(adjusted_coords[1], 32);
|
||||
} else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(my_sizes[0], 32)) && !(is_aligned_to(my_sizes[1], 32))) {
|
||||
my_sizes[0] = align_to(my_sizes[0], 32);
|
||||
my_sizes[1] = align_to(my_sizes[1], 32);
|
||||
|
@ -86,8 +86,10 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
|
||||
{ WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb, { 0, 1, -1, 2, 3, -1, -1, -1 } },
|
||||
{ WeightsLayout::dlstm_dir_io, { 1, 0, -1, 2, 3, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_yx_isa8_osv8_isv4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_yx_isa8_osv16_isv4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_zyx_isa8_osv8_isv4, { 0, 1, 2, 3, 4, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_zyx_isa8_osv16_isv4, { 0, 1, 2, 3, 4, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
|
||||
{ WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, 2, 3, 4, -1, -1, -1 } },
|
||||
{ WeightsLayout::is_o_yx_isv32, { 1, 2, -1, 0, 3, -1, -1, -1 } },
|
||||
@ -457,6 +459,16 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
|
||||
newDims[3] = RoundUp(newDims[3], 32);
|
||||
newDims[4] = RoundUp(newDims[4], 8);
|
||||
break;
|
||||
case os_is_yx_isa8_osv16_isv4:
|
||||
assert(newDims.size() == 4);
|
||||
newDims[3] = RoundUp(newDims[3], 16);
|
||||
newDims[2] = RoundUp(newDims[2], 32);
|
||||
break;
|
||||
case os_is_zyx_isa8_osv16_isv4:
|
||||
assert(newDims.size() == 5);
|
||||
newDims[3] = RoundUp(newDims[3], 32);
|
||||
newDims[4] = RoundUp(newDims[4], 16);
|
||||
break;
|
||||
case os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
|
||||
assert(newDims.size() == 4);
|
||||
newDims[3] = RoundUp(newDims[3], 32);
|
||||
@ -693,6 +705,9 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
|
||||
} else if (l == os_is_yx_isa8_osv8_isv4 || l == os_is_yx_isa8_osv8_isv4_swizzled_by_4) {
|
||||
ret[0].pitch = 256;
|
||||
ret[1].pitch = ret[0].pitch * ret[0].v;
|
||||
} else if (l == os_is_yx_isa8_osv16_isv4) {
|
||||
ret[0].pitch = 512;
|
||||
ret[1].pitch = ret[0].pitch * ret[0].v;
|
||||
} else if (l == os_i_yxs_osv4_yxsv4) {
|
||||
ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 4;
|
||||
ret[3].pitch = ret[2].v * RoundUp(ret[0].v * ret[1].v, 4);
|
||||
|
@ -107,6 +107,8 @@ enum WeightsLayout {
|
||||
dlstm_dir_io, // dlstm weights layout direction, input_size, 4* hidden_size
|
||||
os_is_yx_isa8_osv8_isv4, // for MMAD convolution
|
||||
os_is_zyx_isa8_osv8_isv4, // for MMAD convolution
|
||||
os_is_yx_isa8_osv16_isv4, // for fully connected MMAD
|
||||
os_is_zyx_isa8_osv16_isv4, // for fully connected MMAD
|
||||
os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28,
|
||||
// 1,5...
|
||||
os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28,
|
||||
|
@ -61,27 +61,62 @@ bool FullyConnectedKernelMMAD::Validate(const Params& params, const optional_par
|
||||
return true;
|
||||
}
|
||||
|
||||
FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::SetTuningParams(const fully_connected_params& params) const {
|
||||
FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::GetTuningParams(const fully_connected_params& params) const {
|
||||
FullyConnectedTuningData tuning_data;
|
||||
|
||||
const auto& input = params.inputs[0];
|
||||
const auto& output = params.output;
|
||||
|
||||
size_t feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0 ?
|
||||
input.Feature().v / 32 : CeilDiv(input.Feature().v, 32);
|
||||
tuning_data.sub_group_size = 8;
|
||||
if (input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1 && input.Batch().v == 1) {
|
||||
// Known cases for TGL where simd16 works better than simd8
|
||||
bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;
|
||||
bool simd16_exception_2 = input.Feature().v == 21504 && output.Feature().v == 512;
|
||||
|
||||
if (feature_blocks_count)
|
||||
while (feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
|
||||
if (simd16_exception_1 || simd16_exception_2)
|
||||
tuning_data.sub_group_size = 16;
|
||||
}
|
||||
|
||||
size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;
|
||||
|
||||
tuning_data.feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size != 0 ?
|
||||
input.Feature().v / sub_group_pack_size :
|
||||
input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 ?
|
||||
CeilDiv(input.Feature().v, 32) % 2 == 0 ? CeilDiv(input.Feature().v, 64) : CeilDiv(input.Feature().v, 64) - 1 :
|
||||
CeilDiv(input.Feature().v, sub_group_pack_size);
|
||||
|
||||
bool slm_div_factor_exception = input.Batch().v == 300 && input.Feature().v == 2048 &&
|
||||
output.Batch().v == 300 && (output.Feature().v == 324 || output.Feature().v == 81);
|
||||
|
||||
if (tuning_data.feature_blocks_count && tuning_data.sub_group_size == 8 && !slm_div_factor_exception)
|
||||
while (tuning_data.feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
|
||||
(tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size))
|
||||
tuning_data.slm_div_factor *= 2;
|
||||
|
||||
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
|
||||
|
||||
tuning_data.full_unroll_factor = tuning_data.feature_blocks_count / tuning_data.slm_div_factor;
|
||||
|
||||
if (tuning_data.sub_group_size == 16) {
|
||||
tuning_data.unroll_factor = 1;
|
||||
} else {
|
||||
size_t temp_unroll_factor = 3;
|
||||
|
||||
if (tuning_data.full_unroll_factor > 3) {
|
||||
while (tuning_data.full_unroll_factor % temp_unroll_factor)
|
||||
temp_unroll_factor--;
|
||||
tuning_data.unroll_factor = temp_unroll_factor;
|
||||
} else {
|
||||
tuning_data.unroll_factor = tuning_data.full_unroll_factor;
|
||||
}
|
||||
}
|
||||
|
||||
return tuning_data;
|
||||
}
|
||||
|
||||
FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params,
|
||||
int) const {
|
||||
FullyConnectedTuningData tuning_data = SetTuningParams(params);
|
||||
FullyConnectedTuningData tuning_data = GetTuningParams(params);
|
||||
auto dispatchData = Parent::SetDefault(params);
|
||||
const auto& output = params.output;
|
||||
|
||||
@ -92,84 +127,65 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons
|
||||
}
|
||||
|
||||
JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params,
|
||||
const DispatchData& dispatchData) const {
|
||||
FullyConnectedTuningData tuning_data = SetTuningParams(params);
|
||||
const DispatchData& runInfo) const {
|
||||
FullyConnectedTuningData tuning_data = GetTuningParams(params);
|
||||
|
||||
auto jit = Parent::GetJitConstants(params, dispatchData);
|
||||
auto jit = Parent::GetJitConstants(params, runInfo);
|
||||
|
||||
auto& input = params.inputs[0];
|
||||
auto& weights = params.weights;
|
||||
|
||||
size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;
|
||||
|
||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
|
||||
if (input.GetDims().size() == 5) {
|
||||
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
|
||||
if (tuning_data.sub_group_size == 8) {
|
||||
if (input.GetDims().size() == 5) {
|
||||
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
|
||||
} else {
|
||||
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)"));
|
||||
}
|
||||
} else {
|
||||
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)"));
|
||||
if (input.GetDims().size() == 5) {
|
||||
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
|
||||
} else {
|
||||
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)"));
|
||||
}
|
||||
}
|
||||
|
||||
Datatype input_packed_type = Datatype::INT32;
|
||||
Datatype filter_packed_type = Datatype::INT32;
|
||||
|
||||
if (input.GetDType() == Datatype::UINT8) {
|
||||
input_packed_type = Datatype::UINT32;
|
||||
} else if (input.GetDType() == Datatype::INT8) {
|
||||
input_packed_type = Datatype::INT32;
|
||||
}
|
||||
|
||||
if (weights.GetDType() == WeightsType::UINT8) {
|
||||
filter_packed_type = Datatype::UINT32;
|
||||
} else if (weights.GetDType() == WeightsType::INT8) {
|
||||
filter_packed_type = Datatype::INT32;
|
||||
}
|
||||
|
||||
jit.Merge(MakeTypeJitConstants(input_packed_type, "INPUT_PACKED"));
|
||||
jit.Merge(MakeTypeJitConstants(filter_packed_type, "FILTER_PACKED"));
|
||||
jit.Merge(MakeTypeJitConstants(input.GetDType() == Datatype::UINT8 ? Datatype::UINT32 : Datatype::INT32, "INPUT_PACKED"));
|
||||
jit.Merge(MakeTypeJitConstants(weights.GetDType() == WeightsType::UINT8 ? Datatype::UINT32 : Datatype::INT32, "FILTER_PACKED"));
|
||||
|
||||
auto filter_spatial_size = weights.X().v * weights.Y().v * weights.Z().v;
|
||||
int filter_spatial_pitch = 4 * 8 * 8;
|
||||
auto filter_spatial_pitch = 8 * sub_group_pack_size;
|
||||
auto filter_fblock_pitch = tuning_data.sub_group_size == 8 ?
|
||||
filter_spatial_size * filter_spatial_pitch :
|
||||
filter_spatial_size * filter_spatial_pitch * 2;
|
||||
|
||||
jit.AddConstant(MakeJitConstant("FILTER_SPATIAL_SIZE", filter_spatial_size));
|
||||
jit.AddConstant(MakeJitConstant("MMAD_FILTER_SPATIAL_PITCH", filter_spatial_pitch));
|
||||
jit.AddConstant(MakeJitConstant("MMAD_FILTER_FBLOCK_PITCH", filter_spatial_size * filter_spatial_pitch));
|
||||
jit.AddConstant(MakeJitConstant("MMAD_FILTER_FBLOCK_PITCH", filter_fblock_pitch));
|
||||
|
||||
size_t input_x_pitch = input.X().pitch;
|
||||
size_t input_y_pitch = input.Y().pitch;
|
||||
size_t input_z_pitch = input.Z().pitch;
|
||||
|
||||
if (input.GetLayout() == DataLayout::bfyx) {
|
||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32));
|
||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", sub_group_pack_size));
|
||||
} else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) {
|
||||
input_x_pitch = 32;
|
||||
input_y_pitch *= 32;
|
||||
input_z_pitch *= 32;
|
||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * 32));
|
||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * sub_group_pack_size));
|
||||
}
|
||||
|
||||
bool has_feature_leftovers = (input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size) ||
|
||||
(input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 && CeilDiv(input.Feature().v, 32) % 2);
|
||||
|
||||
jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", has_feature_leftovers));
|
||||
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", tuning_data.feature_blocks_count));
|
||||
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
|
||||
|
||||
size_t feature_blocks_count;
|
||||
size_t temp_unroll_factor = 9, unroll_factor, full_unroll_factor;
|
||||
|
||||
if (input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0) {
|
||||
feature_blocks_count = input.Feature().v / 32;
|
||||
jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", true));
|
||||
} else {
|
||||
feature_blocks_count = CeilDiv(input.Feature().v, 32);
|
||||
}
|
||||
|
||||
full_unroll_factor = feature_blocks_count / tuning_data.slm_div_factor;
|
||||
|
||||
if (full_unroll_factor > 9) {
|
||||
while (full_unroll_factor % temp_unroll_factor)
|
||||
temp_unroll_factor--;
|
||||
unroll_factor = temp_unroll_factor;
|
||||
} else {
|
||||
unroll_factor = full_unroll_factor;
|
||||
}
|
||||
|
||||
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", feature_blocks_count));
|
||||
jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", unroll_factor));
|
||||
jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", full_unroll_factor));
|
||||
jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", tuning_data.unroll_factor));
|
||||
jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", tuning_data.full_unroll_factor));
|
||||
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
|
||||
|
||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch));
|
||||
@ -197,10 +213,9 @@ KernelsData FullyConnectedKernelMMAD::GetKernelsData(const Params& params, const
|
||||
auto fc_params = static_cast<const fully_connected_params&>(params);
|
||||
auto& input = fc_params.inputs[0];
|
||||
|
||||
auto w_layout = WeightsLayout::os_is_yx_isa8_osv8_isv4;
|
||||
if (input.GetDims().size() == 5) {
|
||||
w_layout = WeightsLayout::os_is_zyx_isa8_osv8_isv4;
|
||||
}
|
||||
auto w_layout = GetTuningParams(fc_params).sub_group_size == 16 ?
|
||||
input.GetDims().size() == 4 ? WeightsLayout::os_is_yx_isa8_osv16_isv4 : WeightsLayout::os_is_zyx_isa8_osv16_isv4 :
|
||||
input.GetDims().size() == 4 ? WeightsLayout::os_is_yx_isa8_osv8_isv4 : WeightsLayout::os_is_zyx_isa8_osv8_isv4;
|
||||
|
||||
KernelsData res = {};
|
||||
for (size_t i = 0; i < autoTuneOptions.size(); i++) {
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2016 Intel Corporation
|
||||
// Copyright (c) 2016-2020 Intel Corporation
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@ -30,20 +30,25 @@ public:
|
||||
ParamsKey GetSupportedKey() const override;
|
||||
|
||||
struct FullyConnectedTuningData {
|
||||
const size_t sub_group_size = 8;
|
||||
const size_t pack_size = 4;
|
||||
size_t sub_group_size = 8;
|
||||
size_t slm_div_factor = 1;
|
||||
size_t work_group_size = 1;
|
||||
size_t feature_blocks_count;
|
||||
size_t unroll_factor;
|
||||
size_t full_unroll_factor;
|
||||
};
|
||||
|
||||
protected:
|
||||
JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override;
|
||||
JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
|
||||
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
|
||||
std::vector<FusedOpType> GetSupportedFusedOps() const override {
|
||||
return { FusedOpType::QUANTIZE,
|
||||
FusedOpType::SCALE,
|
||||
FusedOpType::ACTIVATION };
|
||||
FusedOpType::ACTIVATION,
|
||||
FusedOpType::ELTWISE };
|
||||
}
|
||||
bool Validate(const Params& params, const optional_params& options) const override;
|
||||
FullyConnectedTuningData SetTuningParams(const fully_connected_params& params) const;
|
||||
FullyConnectedTuningData GetTuningParams(const fully_connected_params& params) const;
|
||||
};
|
||||
} // namespace kernel_selector
|
||||
|
@ -19,10 +19,17 @@
|
||||
#include "include/fetch.cl"
|
||||
#include "include/mmad.cl"
|
||||
|
||||
#define INPUT_PACKED_TYPE_8 CAT(INPUT_PACKED_TYPE, 8)
|
||||
#define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8)
|
||||
#define INPUT_PACKED_TYPE_8 CAT(INPUT_PACKED_TYPE, 8)
|
||||
#define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8)
|
||||
#define INPUT_PACKED_TYPE_VEC CAT(INPUT_PACKED_TYPE, SUB_GROUP_SIZE)
|
||||
#define FILTER_PACKED_TYPE_VEC CAT(FILTER_PACKED_TYPE, SUB_GROUP_SIZE)
|
||||
|
||||
#define AS_TYPE(type, val) CAT(as_, type)(val)
|
||||
#define BLOCK_READ(ptr) intel_sub_group_block_read((const __global uint*)(ptr))
|
||||
#define BLOCK_READ_8(ptr) intel_sub_group_block_read8((const __global uint*)(ptr))
|
||||
|
||||
#define MMAD CAT(MMAD_, SUB_GROUP_SIZE)
|
||||
|
||||
#define AS_TYPE(type, val) CAT(as_, type)(val)
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
KERNEL(fully_connected_gpu_MMAD)(
|
||||
@ -64,25 +71,27 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
uint k = feature_block * FULL_UNROLL_FACTOR;
|
||||
#else
|
||||
for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
|
||||
#endif
|
||||
#endif // FULL_UNROLL_FACTOR < 2
|
||||
{
|
||||
# if !SPLIT_SPATIAL
|
||||
#if !SPLIT_SPATIAL
|
||||
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
|
||||
# else
|
||||
#else
|
||||
for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
|
||||
for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
|
||||
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
|
||||
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
|
||||
#endif
|
||||
#endif // !SPLIT_SPATIAL
|
||||
|
||||
#else // SPATIAL_MAJOR
|
||||
# if !SPLIT_SPATIAL
|
||||
|
||||
#if !SPLIT_SPATIAL
|
||||
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
|
||||
# else
|
||||
#else
|
||||
for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi)
|
||||
for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
|
||||
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
|
||||
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
|
||||
# endif
|
||||
#endif // !SPLIT_SPATIAL
|
||||
|
||||
#if FULL_UNROLL_FACTOR < 2
|
||||
for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k)
|
||||
@ -90,21 +99,20 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
uint k = feature_block * FULL_UNROLL_FACTOR;
|
||||
#else
|
||||
for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
|
||||
#endif
|
||||
#endif // FULL_UNROLL_FACTOR < 2
|
||||
{
|
||||
#endif
|
||||
#endif // SPATIAL_MAJOR
|
||||
|
||||
#if !SPLIT_SPATIAL
|
||||
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH;
|
||||
#else
|
||||
uint input_idx = input_offset + k * MMAD_INPUT_FBLOCK_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
|
||||
#endif
|
||||
#endif // !SPLIT_SPATIAL
|
||||
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH;
|
||||
|
||||
#if UNROLL_FACTOR < 2
|
||||
uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx));
|
||||
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
|
||||
|
||||
INPUT_PACKED_TYPE_8 activations;
|
||||
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx));
|
||||
INPUT_PACKED_TYPE_VEC activations;
|
||||
|
||||
activations.s0 = sub_group_broadcast(input_data, 0);
|
||||
activations.s1 = sub_group_broadcast(input_data, 1);
|
||||
@ -114,27 +122,44 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
activations.s5 = sub_group_broadcast(input_data, 5);
|
||||
activations.s6 = sub_group_broadcast(input_data, 6);
|
||||
activations.s7 = sub_group_broadcast(input_data, 7);
|
||||
#if SUB_GROUP_SIZE == 16
|
||||
activations.s8 = sub_group_broadcast(input_data, 8);
|
||||
activations.s9 = sub_group_broadcast(input_data, 9);
|
||||
activations.sa = sub_group_broadcast(input_data, 0xa);
|
||||
activations.sb = sub_group_broadcast(input_data, 0xb);
|
||||
activations.sc = sub_group_broadcast(input_data, 0xc);
|
||||
activations.sd = sub_group_broadcast(input_data, 0xd);
|
||||
activations.se = sub_group_broadcast(input_data, 0xe);
|
||||
activations.sf = sub_group_broadcast(input_data, 0xf);
|
||||
#endif // SUB_GROUP_SIZE == 16
|
||||
|
||||
uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx));
|
||||
FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
|
||||
|
||||
dotProd = MMAD_8(activations, weights_data, dotProd);
|
||||
FILTER_PACKED_TYPE_VEC weights_data;
|
||||
#if SUB_GROUP_SIZE == 8
|
||||
weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
|
||||
#else
|
||||
weights_data.lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
|
||||
weights_data.hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 8 * 4));
|
||||
#endif // SUB_GROUP_SIZE == 8
|
||||
|
||||
dotProd = MMAD(activations, weights_data, dotProd);
|
||||
#else // UNROLL_FACTOR < 2
|
||||
INPUT_PACKED_TYPE input_data[UNROLL_FACTOR];
|
||||
FILTER_PACKED_TYPE_8 weights_data[UNROLL_FACTOR];
|
||||
FILTER_PACKED_TYPE_VEC weights_data[UNROLL_FACTOR];
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||
input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, intel_sub_group_block_read((const __global uint*)(input +
|
||||
input_idx + kb * MMAD_INPUT_FBLOCK_PITCH)));
|
||||
|
||||
uint8 weights_data_u0 = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
|
||||
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u0);
|
||||
input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx + kb * MMAD_INPUT_FBLOCK_PITCH));
|
||||
#if SUB_GROUP_SIZE == 8
|
||||
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
|
||||
#else
|
||||
weights_data[kb].lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
|
||||
weights_data[kb].hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 32 + kb * MMAD_FILTER_FBLOCK_PITCH));
|
||||
#endif // SUB_GROUP_SIZE
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||
INPUT_PACKED_TYPE_8 in;
|
||||
INPUT_PACKED_TYPE_VEC in;
|
||||
|
||||
in.s0 = sub_group_broadcast(input_data[kb], 0);
|
||||
in.s1 = sub_group_broadcast(input_data[kb], 1);
|
||||
@ -144,8 +169,17 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
in.s5 = sub_group_broadcast(input_data[kb], 5);
|
||||
in.s6 = sub_group_broadcast(input_data[kb], 6);
|
||||
in.s7 = sub_group_broadcast(input_data[kb], 7);
|
||||
|
||||
dotProd = MMAD_8(in, weights_data[kb], dotProd);
|
||||
#if SUB_GROUP_SIZE == 16
|
||||
in.s8 = sub_group_broadcast(input_data[kb], 8);
|
||||
in.s9 = sub_group_broadcast(input_data[kb], 9);
|
||||
in.sa = sub_group_broadcast(input_data[kb], 0xa);
|
||||
in.sb = sub_group_broadcast(input_data[kb], 0xb);
|
||||
in.sc = sub_group_broadcast(input_data[kb], 0xc);
|
||||
in.sd = sub_group_broadcast(input_data[kb], 0xd);
|
||||
in.se = sub_group_broadcast(input_data[kb], 0xe);
|
||||
in.sf = sub_group_broadcast(input_data[kb], 0xf);
|
||||
#endif // SUB_GROUP_SIZE == 16
|
||||
dotProd = MMAD(in, weights_data[kb], dotProd);
|
||||
}
|
||||
#endif // UNROLL_FACTOR < 2
|
||||
}
|
||||
@ -174,6 +208,7 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
#endif // !SPLIT_SPATIAL
|
||||
|
||||
#else // SPATIAL_MAJOR
|
||||
|
||||
#if !SPLIT_SPATIAL
|
||||
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
|
||||
#else // !SPLIT_SPATIAL
|
||||
@ -182,24 +217,27 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
|
||||
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
|
||||
#endif // !SPLIT_SPATIAL
|
||||
|
||||
#endif // SPATIAL_MAJOR
|
||||
|
||||
#if !SPLIT_SPATIAL
|
||||
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
|
||||
#else // !SPLIT_SPATIAL
|
||||
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
|
||||
#endif // !SPLIT_SPATIAL
|
||||
#else
|
||||
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH +
|
||||
zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
|
||||
#endif // !SPLIT_SPATIAL
|
||||
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
|
||||
|
||||
MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
|
||||
for (uint i = 0; i < 4; i++) {
|
||||
if (FEATURE_BLOCKS_COUNT * 32 + sglid * 4 + i < INPUT0_FEATURE_NUM) {
|
||||
if (FEATURE_BLOCKS_COUNT * SUB_GROUP_SIZE * 4 + sglid * 4 + i < INPUT0_FEATURE_NUM) {
|
||||
input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH];
|
||||
}
|
||||
}
|
||||
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
|
||||
|
||||
INPUT_PACKED_TYPE_8 activations; //activations of all lanes
|
||||
INPUT_PACKED_TYPE_VEC activations;
|
||||
|
||||
activations.s0 = sub_group_broadcast(input_data, 0);
|
||||
activations.s1 = sub_group_broadcast(input_data, 1);
|
||||
activations.s2 = sub_group_broadcast(input_data, 2);
|
||||
@ -208,11 +246,26 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
activations.s5 = sub_group_broadcast(input_data, 5);
|
||||
activations.s6 = sub_group_broadcast(input_data, 6);
|
||||
activations.s7 = sub_group_broadcast(input_data, 7);
|
||||
#if SUB_GROUP_SIZE == 16
|
||||
activations.s8 = sub_group_broadcast(input_data, 8);
|
||||
activations.s9 = sub_group_broadcast(input_data, 9);
|
||||
activations.sa = sub_group_broadcast(input_data, 0xa);
|
||||
activations.sb = sub_group_broadcast(input_data, 0xb);
|
||||
activations.sc = sub_group_broadcast(input_data, 0xc);
|
||||
activations.sd = sub_group_broadcast(input_data, 0xd);
|
||||
activations.se = sub_group_broadcast(input_data, 0xe);
|
||||
activations.sf = sub_group_broadcast(input_data, 0xf);
|
||||
#endif // SUB_GROUP_SIZE == 16
|
||||
|
||||
uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx));
|
||||
FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
|
||||
FILTER_PACKED_TYPE_VEC weights_data;
|
||||
#if SUB_GROUP_SIZE == 8
|
||||
weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
|
||||
#else
|
||||
weights_data.lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
|
||||
weights_data.hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 32));
|
||||
#endif // SUB_GROUP_SIZE == 8
|
||||
|
||||
dotProd = MMAD_8(activations, weights_data, dotProd);
|
||||
dotProd = MMAD(activations, weights_data, dotProd);
|
||||
}
|
||||
#endif // HAS_FEATURE_LEFTOVERS
|
||||
|
||||
@ -220,16 +273,16 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
return;
|
||||
|
||||
#if BIAS_TERM
|
||||
#if BIAS_PER_OUTPUT
|
||||
#if BIAS_PER_OUTPUT
|
||||
const uint bias_index = GET_DATA_INDEX(BIAS, batch, feature, 0, 0);
|
||||
#elif BIAS_PER_OFM
|
||||
const uint bias_index = feature;
|
||||
#endif
|
||||
#endif // BIAS_PER_OUTPUT
|
||||
|
||||
float dequantized = (float)dotProd + biases[bias_index];
|
||||
#else // BIAS_TERM
|
||||
#else
|
||||
float dequantized = (float)dotProd;
|
||||
#endif
|
||||
#endif // BIAS_TERM
|
||||
|
||||
const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0);
|
||||
|
||||
@ -240,7 +293,7 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
output[out_idx] = res;
|
||||
#else
|
||||
output[out_idx] = TO_OUTPUT_TYPE(dequantized);
|
||||
#endif
|
||||
#endif // HAS_FUSED_OPS
|
||||
|
||||
#if SLM_DIV_FACTOR > 1
|
||||
}
|
||||
@ -249,4 +302,11 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
|
||||
#undef INPUT_PACKED_TYPE_8
|
||||
#undef FILTER_PACKED_TYPE_8
|
||||
#undef INPUT_PACKED_TYPE_VEC
|
||||
#undef FILTER_PACKED_TYPE_VEC
|
||||
|
||||
#undef BLOCK_READ
|
||||
#undef BLOCK_READ_8
|
||||
|
||||
#undef MMAD
|
||||
#undef AS_TYPE
|
||||
|
@ -715,6 +715,63 @@ inline uint FUNC(get_os_is_zyx_isa8_osv8_isv4_index)(uint o, uint i, uint z, uin
|
||||
CAT(prefix, _OFM_NUM), \
|
||||
CAT(prefix, _OFFSET))
|
||||
|
||||
// Returns the linear offset of weight element (o, i, y, x) in the
// os_is_yx_isa8_osv16_isv4 layout.
// From fastest- to slowest-varying, the offset is composed of:
//   i % 4, o % 16, (i / 4) % 8, x, y, i / 32, o / 16.
// size_ofm is not read by this function.
inline uint FUNC(get_os_is_yx_isa8_osv16_isv4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset)
{
    // Number of elements stored per spatial position of one block: 4 * 8 * 16.
    const uint block_elems = 4 * 8 * 16;
    // Input feature count rounded up to a multiple of 32.
    const uint ifm_aligned = ((size_ifm + 31) / 32) * 32;

    const uint ofm_block = o / 16;
    const uint ofm_in_block = o % 16;
    const uint ifm_block = i / 32;
    const uint ifm_slice = (i / 4) % 8;
    const uint ifm_in_slice = i % 4;

    size_t linear = offset + ifm_in_slice + 4 * (ofm_in_block + 16 * ifm_slice);
    linear += x * block_elems;
    linear += y * size_x * block_elems;
    linear += ifm_block * size_y * size_x * block_elems;
    linear += ofm_block * (ifm_aligned / 32) * size_y * size_x * block_elems;

    return linear;
}
|
||||
|
||||
// Index of element (o, i, y, x) in the os_is_yx_isa8_osv16_isv4 layout.
// The sizes and the offset are taken from the macros named
// <prefix>_SIZE_X, <prefix>_SIZE_Y, <prefix>_IFM_NUM, <prefix>_OFM_NUM and <prefix>_OFFSET.
#define GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(prefix, o, i, y, x)   \
    FUNC_CALL(get_os_is_yx_isa8_osv16_isv4_index)(                      \
        o, i, y, x,                                                     \
        CAT(prefix, _SIZE_X),                                           \
        CAT(prefix, _SIZE_Y),                                           \
        CAT(prefix, _IFM_NUM),                                          \
        CAT(prefix, _OFM_NUM),                                          \
        CAT(prefix, _OFFSET))
|
||||
|
||||
// Returns the linear element index of weight (o, i, z, y, x) in the
// os_is_zyx_isa8_osv16_isv4 layout.
//
// Components, from fastest- to slowest-varying:
//   i % 4, o % 16, (i / 4) % 8, x, y, z, i / 32, o / 16
// The stride between consecutive groups of 16 output features is computed from
// size_ifm rounded up to a whole number of 32-element groups.
// size_ofm is not used by the computation.
inline uint FUNC(get_os_is_zyx_isa8_osv16_isv4_index)(uint o, uint i, uint z, uint y, uint x,
                                                      uint size_x, uint size_y, uint size_z,
                                                      uint size_ifm, uint size_ofm, uint offset)
{
    // Elements stored per spatial position of one tile:
    // 4 (i % 4) * 16 (o % 16) * 8 ((i / 4) % 8).
    const uint tile_size = 4 * 16 * 8;
    // Number of 32-wide input-feature tiles, rounded up.
    const uint ifm_tiles = (size_ifm + 31) / 32;

    const uint ofm_tile = o / 16;
    const uint ifm_tile = i / 32;

    // Position of the element inside its tile.
    const uint in_tile = (i % 4) + 4 * ((o % 16) + 16 * ((i / 4) % 8));

    // Tile number; nesting order is ofm_tile, ifm_tile, z, y, x (slowest to fastest).
    const uint tile = (((ofm_tile * ifm_tiles + ifm_tile) * size_z + z) * size_y + y) * size_x + x;

    return offset + in_tile + tile * tile_size;
}
|
||||
|
||||
// Index of element (o, i, z, y, x) in the os_is_zyx_isa8_osv16_isv4 layout.
// The sizes and the offset are taken from the macros named
// <prefix>_SIZE_X, <prefix>_SIZE_Y, <prefix>_SIZE_Z, <prefix>_IFM_NUM,
// <prefix>_OFM_NUM and <prefix>_OFFSET.
#define GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(prefix, o, i, z, y, x)   \
    FUNC_CALL(get_os_is_zyx_isa8_osv16_isv4_index)(                         \
        o, i, z, y, x,                                                      \
        CAT(prefix, _SIZE_X),                                               \
        CAT(prefix, _SIZE_Y),                                               \
        CAT(prefix, _SIZE_Z),                                               \
        CAT(prefix, _IFM_NUM),                                              \
        CAT(prefix, _OFM_NUM),                                              \
        CAT(prefix, _OFFSET))
|
||||
|
||||
inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset)
|
||||
{
|
||||
const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32;
|
||||
|
@ -783,6 +783,7 @@ inline uchar FUNC(sub_group_block_read_uchar)(const __local uchar* ptr) __attrib
|
||||
}
|
||||
|
||||
#define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C)
|
||||
#define MMAD_16(A, B, C) FUNC_CALL(mmad16)(A, B, C)
|
||||
#define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C)
|
||||
#define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C)
|
||||
#define MMAD_16x16(A, B, C) FUNC_CALL(mmad16x16)(A, B, C)
|
||||
|
@ -48,6 +48,10 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x
|
||||
return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, y, x);
|
||||
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4
|
||||
return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, z, y, x);
|
||||
#elif defined INPUT0_LAYOUT_OS_IS_YX_ISA8_OSV16_ISV4
|
||||
return GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(INPUT0, o, i, y, x);
|
||||
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISA8_OSV16_ISV4
|
||||
return GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(INPUT0, o, i, z, y, x);
|
||||
#elif defined INPUT0_LAYOUT_IS_O_YX_ISV32
|
||||
return GET_FILTER_IS_O_YX_ISV32(INPUT0, o, i, y, x);
|
||||
#elif defined INPUT0_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4
|
||||
@ -156,6 +160,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
|
||||
return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4
|
||||
return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, z, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_OS_IS_YX_ISA8_OSV16_ISV4
|
||||
return GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(OUTPUT, o, i, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISA8_OSV16_ISV4
|
||||
return GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(OUTPUT, o, i, z, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_IS_O_YX_ISV32
|
||||
return GET_FILTER_IS_O_YX_ISV32(OUTPUT, o, i, y, x);
|
||||
#elif defined OUTPUT_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4
|
||||
|
@ -330,8 +330,10 @@ std::string toString(WeightsLayout layout) {
|
||||
case WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb: return "IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB";
|
||||
case WeightsLayout::dlstm_dir_io: return "DLSTM_DIR_IO";
|
||||
case WeightsLayout::os_is_yx_isa8_osv8_isv4: return "OS_IS_YX_ISA8_OSV8_ISV4";
|
||||
case WeightsLayout::os_is_yx_isa8_osv16_isv4: return "OS_IS_YX_ISA8_OSV16_ISV4";
|
||||
case WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
|
||||
case WeightsLayout::os_is_zyx_isa8_osv8_isv4: return "OS_IS_ZYX_ISA8_OSV8_ISV4";
|
||||
case WeightsLayout::os_is_zyx_isa8_osv16_isv4: return "OS_IS_ZYX_ISA8_OSV16_ISV4";
|
||||
case WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
|
||||
case WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
|
||||
case WeightsLayout::is_o_yx_isv32: return "IS_O_YX_ISV32";
|
||||
|
@ -144,8 +144,12 @@ inline std::string fmt_to_str(format fmt) {
|
||||
return "os_is_yx_isv16_osv16";
|
||||
case format::os_is_yx_isa8_osv8_isv4:
|
||||
return "os_is_yx_isa8_osv8_isv4";
|
||||
case format::os_is_yx_isa8_osv16_isv4:
|
||||
return "os_is_yx_isa8_osv16_isv4";
|
||||
case format::os_is_zyx_isa8_osv8_isv4:
|
||||
return "os_is_zyx_isa8_osv8_isv4";
|
||||
case format::os_is_zyx_isa8_osv16_isv4:
|
||||
return "os_is_zyx_isa8_osv16_isv4";
|
||||
case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
|
||||
return "os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4";
|
||||
case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
|
||||
|
@ -238,8 +238,12 @@ kernel_selector::weights_layout to_weights_layout(format f) {
|
||||
return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb;
|
||||
case format::os_is_yx_isa8_osv8_isv4:
|
||||
return kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4;
|
||||
case format::os_is_yx_isa8_osv16_isv4:
|
||||
return kernel_selector::weights_layout::os_is_yx_isa8_osv16_isv4;
|
||||
case format::os_is_zyx_isa8_osv8_isv4:
|
||||
return kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4;
|
||||
case format::os_is_zyx_isa8_osv16_isv4:
|
||||
return kernel_selector::weights_layout::os_is_zyx_isa8_osv16_isv4;
|
||||
case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
|
||||
return kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
|
||||
case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
|
||||
@ -390,6 +394,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
|
||||
return cldnn::format::os_is_yx_isa8_osv8_isv4;
|
||||
case kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4:
|
||||
return cldnn::format::os_is_zyx_isa8_osv8_isv4;
|
||||
case kernel_selector::weights_layout::os_is_yx_isa8_osv16_isv4:
|
||||
return cldnn::format::os_is_yx_isa8_osv16_isv4;
|
||||
case kernel_selector::weights_layout::os_is_zyx_isa8_osv16_isv4:
|
||||
return cldnn::format::os_is_zyx_isa8_osv16_isv4;
|
||||
case kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
|
||||
return cldnn::format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
|
||||
case kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
|
||||
|
Loading…
Reference in New Issue
Block a user