[IE CLDNN] Fully connected MMAD simd16 improvements (#3394)

This commit is contained in:
Ilya Znamenskiy 2020-12-11 10:15:11 +03:00 committed by GitHub
parent 7fe21dc6ee
commit b6f311b463
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 300 additions and 109 deletions

View File

@ -413,6 +413,9 @@ struct layout {
if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) { if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) {
sizes[0] = align_to(sizes[0], 8); sizes[0] = align_to(sizes[0], 8);
sizes[1] = align_to(sizes[1], 32); sizes[1] = align_to(sizes[1], 32);
} else if (this->format == cldnn::format::os_is_yx_isa8_osv16_isv4 && !(is_aligned_to(sizes[0], 16)) && !(is_aligned_to(sizes[1], 32))) {
sizes[0] = align_to(sizes[0], 16);
sizes[1] = align_to(sizes[1], 32);
} else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) { } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) {
sizes[0] = align_to(sizes[0], 32); sizes[0] = align_to(sizes[0], 32);
sizes[1] = align_to(sizes[1], 32); sizes[1] = align_to(sizes[1], 32);

View File

@ -162,6 +162,8 @@ struct format {
///< convolution, F(6,3) -- filter 3x3 with stride 1 ///< convolution, F(6,3) -- filter 3x3 with stride 1
os_is_yx_isa8_osv8_isv4, ///< format for weights for MMAD convolution os_is_yx_isa8_osv8_isv4, ///< format for weights for MMAD convolution
os_is_zyx_isa8_osv8_isv4, ///< format for weights for MMAD convolution os_is_zyx_isa8_osv8_isv4, ///< format for weights for MMAD convolution
os_is_yx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD
os_is_zyx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD
os_is_yx_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD convolution os_is_yx_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD convolution
os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution
os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution
@ -273,8 +275,10 @@ struct format {
{ image_2d_weights_c1_b_fyx, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}}, { image_2d_weights_c1_b_fyx, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
{ lstm_weights_dio, { 1, 1, 2, 0, 0, "oixy", "oixy?", {}}}, { lstm_weights_dio, { 1, 1, 2, 0, 0, "oixy", "oixy?", {}}},
{ os_is_yx_isa8_osv8_isv4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}}, { os_is_yx_isa8_osv8_isv4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
{ os_is_yx_isa8_osv16_isv4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
{ os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}}, { os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}},
{ os_is_zyx_isa8_osv8_isv4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}}}, { os_is_zyx_isa8_osv8_isv4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}}},
{ os_is_zyx_isa8_osv16_isv4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}}},
{ os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {{0, 32}, {1, 32}}}}, { os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {{0, 32}, {1, 32}}}},
{ os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{0, 32}, {1, 32}}}}, { os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{0, 32}, {1, 32}}}},
{ is_o_yx_isv32, { 1, 1, 2, 0, 0, "oyxi", "oixy?", {{1, 32}}}}, { is_o_yx_isv32, { 1, 1, 2, 0, 0, "oyxi", "oixy?", {{1, 32}}}},
@ -995,6 +999,13 @@ public:
my_sizes[1] = align_to(my_sizes[1], 32); my_sizes[1] = align_to(my_sizes[1], 32);
adjusted_coords[0] = align_to(adjusted_coords[0], 8); adjusted_coords[0] = align_to(adjusted_coords[0], 8);
adjusted_coords[1] = align_to(adjusted_coords[1], 32); adjusted_coords[1] = align_to(adjusted_coords[1], 32);
} else if (fmt == cldnn::format::os_is_yx_isa8_osv16_isv4 &&
!(is_aligned_to(my_sizes[0], 16)) &&
!(is_aligned_to(my_sizes[1], 32))) {
my_sizes[0] = align_to(my_sizes[0], 16);
my_sizes[1] = align_to(my_sizes[1], 32);
adjusted_coords[0] = align_to(adjusted_coords[0], 16);
adjusted_coords[1] = align_to(adjusted_coords[1], 32);
} else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(my_sizes[0], 32)) && !(is_aligned_to(my_sizes[1], 32))) { } else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(my_sizes[0], 32)) && !(is_aligned_to(my_sizes[1], 32))) {
my_sizes[0] = align_to(my_sizes[0], 32); my_sizes[0] = align_to(my_sizes[0], 32);
my_sizes[1] = align_to(my_sizes[1], 32); my_sizes[1] = align_to(my_sizes[1], 32);

View File

@ -86,8 +86,10 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
{ WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::dlstm_dir_io, { 1, 0, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::dlstm_dir_io, { 1, 0, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_isa8_osv8_isv4, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::os_is_yx_isa8_osv8_isv4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_isa8_osv16_isv4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_isa8_osv8_isv4, { 0, 1, 2, 3, 4, -1, -1, -1 } }, { WeightsLayout::os_is_zyx_isa8_osv8_isv4, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_isa8_osv16_isv4, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } },
{ WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, 2, 3, 4, -1, -1, -1 } }, { WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, 2, 3, 4, -1, -1, -1 } },
{ WeightsLayout::is_o_yx_isv32, { 1, 2, -1, 0, 3, -1, -1, -1 } }, { WeightsLayout::is_o_yx_isv32, { 1, 2, -1, 0, 3, -1, -1, -1 } },
@ -457,6 +459,16 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
newDims[3] = RoundUp(newDims[3], 32); newDims[3] = RoundUp(newDims[3], 32);
newDims[4] = RoundUp(newDims[4], 8); newDims[4] = RoundUp(newDims[4], 8);
break; break;
case os_is_yx_isa8_osv16_isv4:
assert(newDims.size() == 4);
newDims[3] = RoundUp(newDims[3], 16);
newDims[2] = RoundUp(newDims[2], 32);
break;
case os_is_zyx_isa8_osv16_isv4:
assert(newDims.size() == 5);
newDims[3] = RoundUp(newDims[3], 32);
newDims[4] = RoundUp(newDims[4], 16);
break;
case os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: case os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
assert(newDims.size() == 4); assert(newDims.size() == 4);
newDims[3] = RoundUp(newDims[3], 32); newDims[3] = RoundUp(newDims[3], 32);
@ -693,6 +705,9 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
} else if (l == os_is_yx_isa8_osv8_isv4 || l == os_is_yx_isa8_osv8_isv4_swizzled_by_4) { } else if (l == os_is_yx_isa8_osv8_isv4 || l == os_is_yx_isa8_osv8_isv4_swizzled_by_4) {
ret[0].pitch = 256; ret[0].pitch = 256;
ret[1].pitch = ret[0].pitch * ret[0].v; ret[1].pitch = ret[0].pitch * ret[0].v;
} else if (l == os_is_yx_isa8_osv16_isv4) {
ret[0].pitch = 512;
ret[1].pitch = ret[0].pitch * ret[0].v;
} else if (l == os_i_yxs_osv4_yxsv4) { } else if (l == os_i_yxs_osv4_yxsv4) {
ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 4; ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 4;
ret[3].pitch = ret[2].v * RoundUp(ret[0].v * ret[1].v, 4); ret[3].pitch = ret[2].v * RoundUp(ret[0].v * ret[1].v, 4);

View File

@ -107,6 +107,8 @@ enum WeightsLayout {
dlstm_dir_io, // dlstm weights layout direction, input_size, 4* hiden_size dlstm_dir_io, // dlstm weights layout direction, input_size, 4* hiden_size
os_is_yx_isa8_osv8_isv4, // for MMAD convolution os_is_yx_isa8_osv8_isv4, // for MMAD convolution
os_is_zyx_isa8_osv8_isv4, // for MMAD convolution os_is_zyx_isa8_osv8_isv4, // for MMAD convolution
os_is_yx_isa8_osv16_isv4, // for fully connected MMAD
os_is_zyx_isa8_osv16_isv4, // for fully connected MMAD
os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28,
// 1,5... // 1,5...
os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28,

View File

@ -61,27 +61,62 @@ bool FullyConnectedKernelMMAD::Validate(const Params& params, const optional_par
return true; return true;
} }
FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::SetTuningParams(const fully_connected_params& params) const { FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::GetTuningParams(const fully_connected_params& params) const {
FullyConnectedTuningData tuning_data; FullyConnectedTuningData tuning_data;
const auto& input = params.inputs[0]; const auto& input = params.inputs[0];
const auto& output = params.output;
size_t feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0 ? tuning_data.sub_group_size = 8;
input.Feature().v / 32 : CeilDiv(input.Feature().v, 32); if (input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1 && input.Batch().v == 1) {
// Known cases for TGL where simd16 works better than simd8
bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;
bool simd16_exception_2 = input.Feature().v == 21504 && output.Feature().v == 512;
if (feature_blocks_count) if (simd16_exception_1 || simd16_exception_2)
while (feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 && tuning_data.sub_group_size = 16;
}
size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;
tuning_data.feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size != 0 ?
input.Feature().v / sub_group_pack_size :
input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 ?
CeilDiv(input.Feature().v, 32) % 2 == 0 ? CeilDiv(input.Feature().v, 64) : CeilDiv(input.Feature().v, 64) - 1 :
CeilDiv(input.Feature().v, sub_group_pack_size);
bool slm_div_factor_exception = input.Batch().v == 300 && input.Feature().v == 2048 &&
output.Batch().v == 300 && (output.Feature().v == 324 || output.Feature().v == 81);
if (tuning_data.feature_blocks_count && tuning_data.sub_group_size == 8 && !slm_div_factor_exception)
while (tuning_data.feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
(tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size)) (tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size))
tuning_data.slm_div_factor *= 2; tuning_data.slm_div_factor *= 2;
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
tuning_data.full_unroll_factor = tuning_data.feature_blocks_count / tuning_data.slm_div_factor;
if (tuning_data.sub_group_size == 16) {
tuning_data.unroll_factor = 1;
} else {
size_t temp_unroll_factor = 3;
if (tuning_data.full_unroll_factor > 3) {
while (tuning_data.full_unroll_factor % temp_unroll_factor)
temp_unroll_factor--;
tuning_data.unroll_factor = temp_unroll_factor;
} else {
tuning_data.unroll_factor = tuning_data.full_unroll_factor;
}
}
return tuning_data; return tuning_data;
} }
FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params, FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params,
int) const { int) const {
FullyConnectedTuningData tuning_data = SetTuningParams(params); FullyConnectedTuningData tuning_data = GetTuningParams(params);
auto dispatchData = Parent::SetDefault(params); auto dispatchData = Parent::SetDefault(params);
const auto& output = params.output; const auto& output = params.output;
@ -92,84 +127,65 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons
} }
JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params, JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params,
const DispatchData& dispatchData) const { const DispatchData& runInfo) const {
FullyConnectedTuningData tuning_data = SetTuningParams(params); FullyConnectedTuningData tuning_data = GetTuningParams(params);
auto jit = Parent::GetJitConstants(params, dispatchData); auto jit = Parent::GetJitConstants(params, runInfo);
auto& input = params.inputs[0]; auto& input = params.inputs[0];
auto& weights = params.weights; auto& weights = params.weights;
size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size)); jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
if (tuning_data.sub_group_size == 8) {
if (input.GetDims().size() == 5) { if (input.GetDims().size() == 5) {
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)")); jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
} else { } else {
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)")); jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)"));
} }
} else {
Datatype input_packed_type = Datatype::INT32; if (input.GetDims().size() == 5) {
Datatype filter_packed_type = Datatype::INT32; jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
} else {
if (input.GetDType() == Datatype::UINT8) { jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)"));
input_packed_type = Datatype::UINT32; }
} else if (input.GetDType() == Datatype::INT8) {
input_packed_type = Datatype::INT32;
} }
if (weights.GetDType() == WeightsType::UINT8) { jit.Merge(MakeTypeJitConstants(input.GetDType() == Datatype::UINT8 ? Datatype::UINT32 : Datatype::INT32, "INPUT_PACKED"));
filter_packed_type = Datatype::UINT32; jit.Merge(MakeTypeJitConstants(weights.GetDType() == WeightsType::UINT8 ? Datatype::UINT32 : Datatype::INT32, "FILTER_PACKED"));
} else if (weights.GetDType() == WeightsType::INT8) {
filter_packed_type = Datatype::INT32;
}
jit.Merge(MakeTypeJitConstants(input_packed_type, "INPUT_PACKED"));
jit.Merge(MakeTypeJitConstants(filter_packed_type, "FILTER_PACKED"));
auto filter_spatial_size = weights.X().v * weights.Y().v * weights.Z().v; auto filter_spatial_size = weights.X().v * weights.Y().v * weights.Z().v;
int filter_spatial_pitch = 4 * 8 * 8; auto filter_spatial_pitch = 8 * sub_group_pack_size;
auto filter_fblock_pitch = tuning_data.sub_group_size == 8 ?
filter_spatial_size * filter_spatial_pitch :
filter_spatial_size * filter_spatial_pitch * 2;
jit.AddConstant(MakeJitConstant("FILTER_SPATIAL_SIZE", filter_spatial_size)); jit.AddConstant(MakeJitConstant("FILTER_SPATIAL_SIZE", filter_spatial_size));
jit.AddConstant(MakeJitConstant("MMAD_FILTER_SPATIAL_PITCH", filter_spatial_pitch)); jit.AddConstant(MakeJitConstant("MMAD_FILTER_SPATIAL_PITCH", filter_spatial_pitch));
jit.AddConstant(MakeJitConstant("MMAD_FILTER_FBLOCK_PITCH", filter_spatial_size * filter_spatial_pitch)); jit.AddConstant(MakeJitConstant("MMAD_FILTER_FBLOCK_PITCH", filter_fblock_pitch));
size_t input_x_pitch = input.X().pitch; size_t input_x_pitch = input.X().pitch;
size_t input_y_pitch = input.Y().pitch; size_t input_y_pitch = input.Y().pitch;
size_t input_z_pitch = input.Z().pitch; size_t input_z_pitch = input.Z().pitch;
if (input.GetLayout() == DataLayout::bfyx) { if (input.GetLayout() == DataLayout::bfyx) {
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", sub_group_pack_size));
} else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) { } else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) {
input_x_pitch = 32; input_x_pitch = 32;
input_y_pitch *= 32; input_y_pitch *= 32;
input_z_pitch *= 32; input_z_pitch *= 32;
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * 32)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * sub_group_pack_size));
} }
bool has_feature_leftovers = (input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size) ||
(input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 && CeilDiv(input.Feature().v, 32) % 2);
jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", has_feature_leftovers));
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", tuning_data.feature_blocks_count));
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", tuning_data.unroll_factor));
size_t feature_blocks_count; jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", tuning_data.full_unroll_factor));
size_t temp_unroll_factor = 9, unroll_factor, full_unroll_factor;
if (input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0) {
feature_blocks_count = input.Feature().v / 32;
jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", true));
} else {
feature_blocks_count = CeilDiv(input.Feature().v, 32);
}
full_unroll_factor = feature_blocks_count / tuning_data.slm_div_factor;
if (full_unroll_factor > 9) {
while (full_unroll_factor % temp_unroll_factor)
temp_unroll_factor--;
unroll_factor = temp_unroll_factor;
} else {
unroll_factor = full_unroll_factor;
}
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", feature_blocks_count));
jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", unroll_factor));
jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", full_unroll_factor));
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size)); jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch));
@ -197,10 +213,9 @@ KernelsData FullyConnectedKernelMMAD::GetKernelsData(const Params& params, const
auto fc_params = static_cast<const fully_connected_params&>(params); auto fc_params = static_cast<const fully_connected_params&>(params);
auto& input = fc_params.inputs[0]; auto& input = fc_params.inputs[0];
auto w_layout = WeightsLayout::os_is_yx_isa8_osv8_isv4; auto w_layout = GetTuningParams(fc_params).sub_group_size == 16 ?
if (input.GetDims().size() == 5) { input.GetDims().size() == 4 ? WeightsLayout::os_is_yx_isa8_osv16_isv4 : WeightsLayout::os_is_zyx_isa8_osv16_isv4 :
w_layout = WeightsLayout::os_is_zyx_isa8_osv8_isv4; input.GetDims().size() == 4 ? WeightsLayout::os_is_yx_isa8_osv8_isv4 : WeightsLayout::os_is_zyx_isa8_osv8_isv4;
}
KernelsData res = {}; KernelsData res = {};
for (size_t i = 0; i < autoTuneOptions.size(); i++) { for (size_t i = 0; i < autoTuneOptions.size(); i++) {

View File

@ -1,4 +1,4 @@
// Copyright (c) 2016 Intel Corporation // Copyright (c) 2016-2020 Intel Corporation
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
@ -30,20 +30,25 @@ public:
ParamsKey GetSupportedKey() const override; ParamsKey GetSupportedKey() const override;
struct FullyConnectedTuningData { struct FullyConnectedTuningData {
const size_t sub_group_size = 8; const size_t pack_size = 4;
size_t sub_group_size = 8;
size_t slm_div_factor = 1; size_t slm_div_factor = 1;
size_t work_group_size = 1; size_t work_group_size = 1;
size_t feature_blocks_count;
size_t unroll_factor;
size_t full_unroll_factor;
}; };
protected: protected:
JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override; JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override; DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
std::vector<FusedOpType> GetSupportedFusedOps() const override { std::vector<FusedOpType> GetSupportedFusedOps() const override {
return { FusedOpType::QUANTIZE, return { FusedOpType::QUANTIZE,
FusedOpType::SCALE, FusedOpType::SCALE,
FusedOpType::ACTIVATION }; FusedOpType::ACTIVATION,
FusedOpType::ELTWISE };
} }
bool Validate(const Params& params, const optional_params& options) const override; bool Validate(const Params& params, const optional_params& options) const override;
FullyConnectedTuningData SetTuningParams(const fully_connected_params& params) const; FullyConnectedTuningData GetTuningParams(const fully_connected_params& params) const;
}; };
} // namespace kernel_selector } // namespace kernel_selector

View File

@ -21,6 +21,13 @@
#define INPUT_PACKED_TYPE_8 CAT(INPUT_PACKED_TYPE, 8) #define INPUT_PACKED_TYPE_8 CAT(INPUT_PACKED_TYPE, 8)
#define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8) #define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8)
#define INPUT_PACKED_TYPE_VEC CAT(INPUT_PACKED_TYPE, SUB_GROUP_SIZE)
#define FILTER_PACKED_TYPE_VEC CAT(FILTER_PACKED_TYPE, SUB_GROUP_SIZE)
#define BLOCK_READ(ptr) intel_sub_group_block_read((const __global uint*)(ptr))
#define BLOCK_READ_8(ptr) intel_sub_group_block_read8((const __global uint*)(ptr))
#define MMAD CAT(MMAD_, SUB_GROUP_SIZE)
#define AS_TYPE(type, val) CAT(as_, type)(val) #define AS_TYPE(type, val) CAT(as_, type)(val)
@ -64,7 +71,7 @@ KERNEL(fully_connected_gpu_MMAD)(
uint k = feature_block * FULL_UNROLL_FACTOR; uint k = feature_block * FULL_UNROLL_FACTOR;
#else #else
for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR) for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
#endif #endif // FULL_UNROLL_FACTOR < 2
{ {
#if !SPLIT_SPATIAL #if !SPLIT_SPATIAL
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
@ -73,8 +80,10 @@ KERNEL(fully_connected_gpu_MMAD)(
for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) { for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y; const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
#endif #endif // !SPLIT_SPATIAL
#else // SPATIAL_MAJOR #else // SPATIAL_MAJOR
#if !SPLIT_SPATIAL #if !SPLIT_SPATIAL
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
#else #else
@ -82,7 +91,7 @@ KERNEL(fully_connected_gpu_MMAD)(
for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi)
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) { for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y; const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
# endif #endif // !SPLIT_SPATIAL
#if FULL_UNROLL_FACTOR < 2 #if FULL_UNROLL_FACTOR < 2
for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k) for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k)
@ -90,21 +99,20 @@ KERNEL(fully_connected_gpu_MMAD)(
uint k = feature_block * FULL_UNROLL_FACTOR; uint k = feature_block * FULL_UNROLL_FACTOR;
#else #else
for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR) for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
#endif #endif // FULL_UNROLL_FACTOR < 2
{ {
#endif #endif // SPATIAL_MAJOR
#if !SPLIT_SPATIAL #if !SPLIT_SPATIAL
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH; uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH;
#else #else
uint input_idx = input_offset + k * MMAD_INPUT_FBLOCK_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH; uint input_idx = input_offset + k * MMAD_INPUT_FBLOCK_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
#endif #endif // !SPLIT_SPATIAL
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH; uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH;
#if UNROLL_FACTOR < 2 #if UNROLL_FACTOR < 2
uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx)); INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx));
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u); INPUT_PACKED_TYPE_VEC activations;
INPUT_PACKED_TYPE_8 activations;
activations.s0 = sub_group_broadcast(input_data, 0); activations.s0 = sub_group_broadcast(input_data, 0);
activations.s1 = sub_group_broadcast(input_data, 1); activations.s1 = sub_group_broadcast(input_data, 1);
@ -114,27 +122,44 @@ KERNEL(fully_connected_gpu_MMAD)(
activations.s5 = sub_group_broadcast(input_data, 5); activations.s5 = sub_group_broadcast(input_data, 5);
activations.s6 = sub_group_broadcast(input_data, 6); activations.s6 = sub_group_broadcast(input_data, 6);
activations.s7 = sub_group_broadcast(input_data, 7); activations.s7 = sub_group_broadcast(input_data, 7);
#if SUB_GROUP_SIZE == 16
activations.s8 = sub_group_broadcast(input_data, 8);
activations.s9 = sub_group_broadcast(input_data, 9);
activations.sa = sub_group_broadcast(input_data, 0xa);
activations.sb = sub_group_broadcast(input_data, 0xb);
activations.sc = sub_group_broadcast(input_data, 0xc);
activations.sd = sub_group_broadcast(input_data, 0xd);
activations.se = sub_group_broadcast(input_data, 0xe);
activations.sf = sub_group_broadcast(input_data, 0xf);
#endif // SUB_GROUP_SIZE == 16
uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)); FILTER_PACKED_TYPE_VEC weights_data;
FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u); #if SUB_GROUP_SIZE == 8
weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
dotProd = MMAD_8(activations, weights_data, dotProd);
#else #else
weights_data.lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
weights_data.hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 8 * 4));
#endif // SUB_GROUP_SIZE == 8
dotProd = MMAD(activations, weights_data, dotProd);
#else // UNROLL_FACTOR < 2
INPUT_PACKED_TYPE input_data[UNROLL_FACTOR]; INPUT_PACKED_TYPE input_data[UNROLL_FACTOR];
FILTER_PACKED_TYPE_8 weights_data[UNROLL_FACTOR]; FILTER_PACKED_TYPE_VEC weights_data[UNROLL_FACTOR];
__attribute__((opencl_unroll_hint)) __attribute__((opencl_unroll_hint))
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, intel_sub_group_block_read((const __global uint*)(input + input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx + kb * MMAD_INPUT_FBLOCK_PITCH));
input_idx + kb * MMAD_INPUT_FBLOCK_PITCH))); #if SUB_GROUP_SIZE == 8
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
uint8 weights_data_u0 = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH)); #else
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u0); weights_data[kb].lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
weights_data[kb].hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 32 + kb * MMAD_FILTER_FBLOCK_PITCH));
#endif // SUB_GROUP_SIZE
} }
__attribute__((opencl_unroll_hint)) __attribute__((opencl_unroll_hint))
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
INPUT_PACKED_TYPE_8 in; INPUT_PACKED_TYPE_VEC in;
in.s0 = sub_group_broadcast(input_data[kb], 0); in.s0 = sub_group_broadcast(input_data[kb], 0);
in.s1 = sub_group_broadcast(input_data[kb], 1); in.s1 = sub_group_broadcast(input_data[kb], 1);
@ -144,8 +169,17 @@ KERNEL(fully_connected_gpu_MMAD)(
in.s5 = sub_group_broadcast(input_data[kb], 5); in.s5 = sub_group_broadcast(input_data[kb], 5);
in.s6 = sub_group_broadcast(input_data[kb], 6); in.s6 = sub_group_broadcast(input_data[kb], 6);
in.s7 = sub_group_broadcast(input_data[kb], 7); in.s7 = sub_group_broadcast(input_data[kb], 7);
#if SUB_GROUP_SIZE == 16
dotProd = MMAD_8(in, weights_data[kb], dotProd); in.s8 = sub_group_broadcast(input_data[kb], 8);
in.s9 = sub_group_broadcast(input_data[kb], 9);
in.sa = sub_group_broadcast(input_data[kb], 0xa);
in.sb = sub_group_broadcast(input_data[kb], 0xb);
in.sc = sub_group_broadcast(input_data[kb], 0xc);
in.sd = sub_group_broadcast(input_data[kb], 0xd);
in.se = sub_group_broadcast(input_data[kb], 0xe);
in.sf = sub_group_broadcast(input_data[kb], 0xf);
#endif // SUB_GROUP_SIZE == 16
dotProd = MMAD(in, weights_data[kb], dotProd);
} }
#endif // UNROLL_FACTOR < 2 #endif // UNROLL_FACTOR < 2
} }
@ -174,6 +208,7 @@ KERNEL(fully_connected_gpu_MMAD)(
#endif // !SPLIT_SPATIAL #endif // !SPLIT_SPATIAL
#else // SPATIAL_MAJOR #else // SPATIAL_MAJOR
#if !SPLIT_SPATIAL #if !SPLIT_SPATIAL
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
#else // !SPLIT_SPATIAL #else // !SPLIT_SPATIAL
@ -182,24 +217,27 @@ KERNEL(fully_connected_gpu_MMAD)(
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) { for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y; const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
#endif // !SPLIT_SPATIAL #endif // !SPLIT_SPATIAL
#endif // SPATIAL_MAJOR #endif // SPATIAL_MAJOR
#if !SPLIT_SPATIAL #if !SPLIT_SPATIAL
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH; uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
#else // !SPLIT_SPATIAL #else
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH; uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH +
zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
#endif // !SPLIT_SPATIAL #endif // !SPLIT_SPATIAL
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH; uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0); MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
for (uint i = 0; i < 4; i++) { for (uint i = 0; i < 4; i++) {
if (FEATURE_BLOCKS_COUNT * 32 + sglid * 4 + i < INPUT0_FEATURE_NUM) { if (FEATURE_BLOCKS_COUNT * SUB_GROUP_SIZE * 4 + sglid * 4 + i < INPUT0_FEATURE_NUM) {
input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH]; input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH];
} }
} }
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u); INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
INPUT_PACKED_TYPE_8 activations; //activations of all lanes INPUT_PACKED_TYPE_VEC activations;
activations.s0 = sub_group_broadcast(input_data, 0); activations.s0 = sub_group_broadcast(input_data, 0);
activations.s1 = sub_group_broadcast(input_data, 1); activations.s1 = sub_group_broadcast(input_data, 1);
activations.s2 = sub_group_broadcast(input_data, 2); activations.s2 = sub_group_broadcast(input_data, 2);
@ -208,11 +246,26 @@ KERNEL(fully_connected_gpu_MMAD)(
activations.s5 = sub_group_broadcast(input_data, 5); activations.s5 = sub_group_broadcast(input_data, 5);
activations.s6 = sub_group_broadcast(input_data, 6); activations.s6 = sub_group_broadcast(input_data, 6);
activations.s7 = sub_group_broadcast(input_data, 7); activations.s7 = sub_group_broadcast(input_data, 7);
#if SUB_GROUP_SIZE == 16
activations.s8 = sub_group_broadcast(input_data, 8);
activations.s9 = sub_group_broadcast(input_data, 9);
activations.sa = sub_group_broadcast(input_data, 0xa);
activations.sb = sub_group_broadcast(input_data, 0xb);
activations.sc = sub_group_broadcast(input_data, 0xc);
activations.sd = sub_group_broadcast(input_data, 0xd);
activations.se = sub_group_broadcast(input_data, 0xe);
activations.sf = sub_group_broadcast(input_data, 0xf);
#endif // SUB_GROUP_SIZE == 16
uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)); FILTER_PACKED_TYPE_VEC weights_data;
FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u); #if SUB_GROUP_SIZE == 8
weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
#else
weights_data.lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx));
weights_data.hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 32));
#endif // SUB_GROUP_SIZE == 8
dotProd = MMAD_8(activations, weights_data, dotProd); dotProd = MMAD(activations, weights_data, dotProd);
} }
#endif // HAS_FEATURE_LEFTOVERS #endif // HAS_FEATURE_LEFTOVERS
@ -224,12 +277,12 @@ KERNEL(fully_connected_gpu_MMAD)(
const uint bias_index = GET_DATA_INDEX(BIAS, batch, feature, 0, 0); const uint bias_index = GET_DATA_INDEX(BIAS, batch, feature, 0, 0);
#elif BIAS_PER_OFM #elif BIAS_PER_OFM
const uint bias_index = feature; const uint bias_index = feature;
#endif #endif // BIAS_PER_OUTPUT
float dequantized = (float)dotProd + biases[bias_index]; float dequantized = (float)dotProd + biases[bias_index];
#else // BIAS_TERM #else
float dequantized = (float)dotProd; float dequantized = (float)dotProd;
#endif #endif // BIAS_TERM
const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0); const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0);
@ -240,7 +293,7 @@ KERNEL(fully_connected_gpu_MMAD)(
output[out_idx] = res; output[out_idx] = res;
#else #else
output[out_idx] = TO_OUTPUT_TYPE(dequantized); output[out_idx] = TO_OUTPUT_TYPE(dequantized);
#endif #endif // HAS_FUSED_OPS
#if SLM_DIV_FACTOR > 1 #if SLM_DIV_FACTOR > 1
} }
@ -249,4 +302,11 @@ KERNEL(fully_connected_gpu_MMAD)(
#undef INPUT_PACKED_TYPE_8 #undef INPUT_PACKED_TYPE_8
#undef FILTER_PACKED_TYPE_8 #undef FILTER_PACKED_TYPE_8
#undef INPUT_PACKED_TYPE_VEC
#undef FILTER_PACKED_TYPE_VEC
#undef BLOCK_READ
#undef BLOCK_READ_8
#undef MMAD
#undef AS_TYPE #undef AS_TYPE

View File

@ -715,6 +715,63 @@ inline uint FUNC(get_os_is_zyx_isa8_osv8_isv4_index)(uint o, uint i, uint z, uin
CAT(prefix, _OFM_NUM), \ CAT(prefix, _OFM_NUM), \
CAT(prefix, _OFFSET)) CAT(prefix, _OFFSET))
// Returns the linear element offset of weight (o, i, y, x) for the
// os_is_yx_isa8_osv16_isv4 layout.
//
// Elements are grouped in blocks of 4 * 8 * 16 values. Inside a block the
// fastest-changing coordinate is i % 4, followed by o % 16, followed by
// (i / 4) % 8. Blocks are then ordered by x, y, input-feature slice (i / 32)
// and finally output-feature slice (o / 16).
//
// o, i        - output / input feature coordinates
// y, x        - spatial coordinates
// size_x/y    - spatial extents used to derive the block pitches
// size_ifm    - number of input features (rounded up to a multiple of 32 here)
// size_ofm    - accepted for signature symmetry; not used by the computation
// offset      - base offset added to the result
inline uint FUNC(get_os_is_yx_isa8_osv16_isv4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset)
{
    // Number of elements in one isa8 x osv16 x isv4 block.
    const uint block_elems = 4 * 8 * 16;
    // Count of 32-wide input-feature slices, rounding size_ifm up.
    const uint ifm_slices = (size_ifm + 31) / 32;

    // Position of the element inside its block.
    const uint in_lane = i % 4;
    const uint out_lane = o % 16;
    const uint in_group = (i / 4) % 8;
    const uint inner = in_lane + 4 * (out_lane + 16 * in_group);

    // Pitches (in elements) between neighbouring blocks along each axis.
    const uint x_pitch = block_elems;
    const uint y_pitch = size_x * x_pitch;
    const uint is_pitch = size_y * y_pitch;
    const uint os_pitch = ifm_slices * is_pitch;

    return offset + inner
         + x * x_pitch
         + y * y_pitch
         + (i / 32) * is_pitch
         + (o / 16) * os_pitch;
}
// Convenience wrapper around get_os_is_yx_isa8_osv16_isv4_index: forwards the
// (o, i, y, x) coordinates and supplies the remaining arguments from the
// tensor macros built with CAT(prefix, ...): _SIZE_X, _SIZE_Y, _IFM_NUM,
// _OFM_NUM and _OFFSET.
// NOTE(review): presumably CAT token-pastes `prefix` with the suffix - confirm
// against its definition.
#define GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(prefix, o, i, y, x) \
FUNC_CALL(get_os_is_yx_isa8_osv16_isv4_index)( \
o, i, y, x, CAT(prefix, _SIZE_X ), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM), \
CAT(prefix, _OFFSET))
// Returns the linear element offset of weight (o, i, z, y, x) for the
// os_is_zyx_isa8_osv16_isv4 layout.
//
// Elements are grouped in blocks of 4 * 8 * 16 values. Inside a block the
// fastest-changing coordinate is i % 4, followed by o % 16, followed by
// (i / 4) % 8. Blocks are then ordered by x, y, z, input-feature slice
// (i / 32) and finally output-feature slice (o / 16).
//
// o, i          - output / input feature coordinates
// z, y, x       - spatial coordinates
// size_x/y/z    - spatial extents used to derive the block pitches
// size_ifm      - number of input features (rounded up to a multiple of 32 here)
// size_ofm      - accepted for signature symmetry; not used by the computation
// offset        - base offset added to the result
inline uint FUNC(get_os_is_zyx_isa8_osv16_isv4_index)(uint o, uint i, uint z, uint y, uint x,
                                                      uint size_x, uint size_y, uint size_z,
                                                      uint size_ifm, uint size_ofm, uint offset)
{
    // Number of elements in one isa8 x osv16 x isv4 block.
    const uint block_elems = 4 * 8 * 16;
    // Count of 32-wide input-feature slices, rounding size_ifm up.
    const uint slices_per_ofm_block = (size_ifm + 31) / 32;

    // Position of the element inside its block.
    const uint in_lane = i % 4;
    const uint out_lane = o % 16;
    const uint in_group = (i / 4) % 8;
    const uint inner = in_lane + 4 * (out_lane + 16 * in_group);

    // Pitches (in elements) between neighbouring blocks along each axis.
    const uint x_pitch = block_elems;
    const uint y_pitch = size_x * x_pitch;
    const uint z_pitch = size_y * y_pitch;
    const uint is_pitch = size_z * z_pitch;
    const uint os_pitch = slices_per_ofm_block * is_pitch;

    return offset + inner
         + x * x_pitch
         + y * y_pitch
         + z * z_pitch
         + (i / 32) * is_pitch
         + (o / 16) * os_pitch;
}
// Convenience wrapper around get_os_is_zyx_isa8_osv16_isv4_index: forwards the
// (o, i, z, y, x) coordinates and supplies the remaining arguments from the
// tensor macros built with CAT(prefix, ...): _SIZE_X, _SIZE_Y, _SIZE_Z,
// _IFM_NUM, _OFM_NUM and _OFFSET.
// NOTE(review): presumably CAT token-pastes `prefix` with the suffix - confirm
// against its definition.
#define GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(prefix, o, i, z, y, x) \
FUNC_CALL(get_os_is_zyx_isa8_osv16_isv4_index)( \
o, i, z, y, x, \
CAT(prefix, _SIZE_X ), \
CAT(prefix, _SIZE_Y), \
CAT(prefix, _SIZE_Z), \
CAT(prefix, _IFM_NUM), \
CAT(prefix, _OFM_NUM), \
CAT(prefix, _OFFSET))
inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset) inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset)
{ {
const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32; const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32;

View File

@ -783,6 +783,7 @@ inline uchar FUNC(sub_group_block_read_uchar)(const __local uchar* ptr) __attrib
} }
#define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C) #define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C)
#define MMAD_16(A, B, C) FUNC_CALL(mmad16)(A, B, C)
#define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C) #define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C)
#define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C) #define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C)
#define MMAD_16x16(A, B, C) FUNC_CALL(mmad16x16)(A, B, C) #define MMAD_16x16(A, B, C) FUNC_CALL(mmad16x16)(A, B, C)

View File

@ -48,6 +48,10 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x
return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, y, x); return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4 #elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4
return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, z, y, x); return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_YX_ISA8_OSV16_ISV4
return GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(INPUT0, o, i, y, x);
#elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISA8_OSV16_ISV4
return GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(INPUT0, o, i, z, y, x);
#elif defined INPUT0_LAYOUT_IS_O_YX_ISV32 #elif defined INPUT0_LAYOUT_IS_O_YX_ISV32
return GET_FILTER_IS_O_YX_ISV32(INPUT0, o, i, y, x); return GET_FILTER_IS_O_YX_ISV32(INPUT0, o, i, y, x);
#elif defined INPUT0_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4 #elif defined INPUT0_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4
@ -156,6 +160,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, y, x); return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4 #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4
return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, z, y, x); return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_YX_ISA8_OSV16_ISV4
return GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(OUTPUT, o, i, y, x);
#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISA8_OSV16_ISV4
return GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(OUTPUT, o, i, z, y, x);
#elif defined OUTPUT_LAYOUT_IS_O_YX_ISV32 #elif defined OUTPUT_LAYOUT_IS_O_YX_ISV32
return GET_FILTER_IS_O_YX_ISV32(OUTPUT, o, i, y, x); return GET_FILTER_IS_O_YX_ISV32(OUTPUT, o, i, y, x);
#elif defined OUTPUT_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4 #elif defined OUTPUT_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4

View File

@ -330,8 +330,10 @@ std::string toString(WeightsLayout layout) {
case WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb: return "IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB"; case WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb: return "IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB";
case WeightsLayout::dlstm_dir_io: return "DLSTM_DIR_IO"; case WeightsLayout::dlstm_dir_io: return "DLSTM_DIR_IO";
case WeightsLayout::os_is_yx_isa8_osv8_isv4: return "OS_IS_YX_ISA8_OSV8_ISV4"; case WeightsLayout::os_is_yx_isa8_osv8_isv4: return "OS_IS_YX_ISA8_OSV8_ISV4";
case WeightsLayout::os_is_yx_isa8_osv16_isv4: return "OS_IS_YX_ISA8_OSV16_ISV4";
case WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
case WeightsLayout::os_is_zyx_isa8_osv8_isv4: return "OS_IS_ZYX_ISA8_OSV8_ISV4"; case WeightsLayout::os_is_zyx_isa8_osv8_isv4: return "OS_IS_ZYX_ISA8_OSV8_ISV4";
case WeightsLayout::os_is_zyx_isa8_osv16_isv4: return "OS_IS_ZYX_ISA8_OSV16_ISV4";
case WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
case WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4";
case WeightsLayout::is_o_yx_isv32: return "IS_O_YX_ISV32"; case WeightsLayout::is_o_yx_isv32: return "IS_O_YX_ISV32";

View File

@ -144,8 +144,12 @@ inline std::string fmt_to_str(format fmt) {
return "os_is_yx_isv16_osv16"; return "os_is_yx_isv16_osv16";
case format::os_is_yx_isa8_osv8_isv4: case format::os_is_yx_isa8_osv8_isv4:
return "os_is_yx_isa8_osv8_isv4"; return "os_is_yx_isa8_osv8_isv4";
case format::os_is_yx_isa8_osv16_isv4:
return "os_is_yx_isa8_osv16_isv4";
case format::os_is_zyx_isa8_osv8_isv4: case format::os_is_zyx_isa8_osv8_isv4:
return "os_is_zyx_isa8_osv8_isv4"; return "os_is_zyx_isa8_osv8_isv4";
case format::os_is_zyx_isa8_osv16_isv4:
return "os_is_zyx_isa8_osv16_isv4";
case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
return "os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4"; return "os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4";
case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:

View File

@ -238,8 +238,12 @@ kernel_selector::weights_layout to_weights_layout(format f) {
return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb; return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb;
case format::os_is_yx_isa8_osv8_isv4: case format::os_is_yx_isa8_osv8_isv4:
return kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4; return kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4;
case format::os_is_yx_isa8_osv16_isv4:
return kernel_selector::weights_layout::os_is_yx_isa8_osv16_isv4;
case format::os_is_zyx_isa8_osv8_isv4: case format::os_is_zyx_isa8_osv8_isv4:
return kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4; return kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4;
case format::os_is_zyx_isa8_osv16_isv4:
return kernel_selector::weights_layout::os_is_zyx_isa8_osv16_isv4;
case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
return kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; return kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4:
@ -390,6 +394,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
return cldnn::format::os_is_yx_isa8_osv8_isv4; return cldnn::format::os_is_yx_isa8_osv8_isv4;
case kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4: case kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4:
return cldnn::format::os_is_zyx_isa8_osv8_isv4; return cldnn::format::os_is_zyx_isa8_osv8_isv4;
case kernel_selector::weights_layout::os_is_yx_isa8_osv16_isv4:
return cldnn::format::os_is_yx_isa8_osv16_isv4;
case kernel_selector::weights_layout::os_is_zyx_isa8_osv16_isv4:
return cldnn::format::os_is_zyx_isa8_osv16_isv4;
case kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: case kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4:
return cldnn::format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; return cldnn::format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4;
case kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: case kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: