diff --git a/inference-engine/thirdparty/clDNN/api/layout.hpp b/inference-engine/thirdparty/clDNN/api/layout.hpp index 3fe7e537fb7..afe12825ff9 100644 --- a/inference-engine/thirdparty/clDNN/api/layout.hpp +++ b/inference-engine/thirdparty/clDNN/api/layout.hpp @@ -413,6 +413,9 @@ struct layout { if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4 && !(is_aligned_to(sizes[0], 8)) && !(is_aligned_to(sizes[1], 32))) { sizes[0] = align_to(sizes[0], 8); sizes[1] = align_to(sizes[1], 32); + } else if (this->format == cldnn::format::os_is_yx_isa8_osv16_isv4 && !(is_aligned_to(sizes[0], 16)) && !(is_aligned_to(sizes[1], 32))) { + sizes[0] = align_to(sizes[0], 16); + sizes[1] = align_to(sizes[1], 32); } else if (this->format == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(sizes[0], 32)) && !(is_aligned_to(sizes[1], 32))) { sizes[0] = align_to(sizes[0], 32); sizes[1] = align_to(sizes[1], 32); diff --git a/inference-engine/thirdparty/clDNN/api/tensor.hpp b/inference-engine/thirdparty/clDNN/api/tensor.hpp index b9a236bce93..5661423d8be 100644 --- a/inference-engine/thirdparty/clDNN/api/tensor.hpp +++ b/inference-engine/thirdparty/clDNN/api/tensor.hpp @@ -162,6 +162,8 @@ struct format { ///< convolution, F(6,3) -- filter 3x3 with stride 1 os_is_yx_isa8_osv8_isv4, ///< format for weights for MMAD convolution os_is_zyx_isa8_osv8_isv4, ///< format for weights for MMAD convolution + os_is_yx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD + os_is_zyx_isa8_osv16_isv4, ///< format for weights for fully connected MMAD os_is_yx_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD convolution os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, ///< format for weights for MMAD fsv32 convolution @@ -273,8 +275,10 @@ struct format { { image_2d_weights_c1_b_fyx, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}}, { lstm_weights_dio, { 1, 1, 2, 0, 0, "oixy", "oixy?", {}}}, { os_is_yx_isa8_osv8_isv4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}}, + { os_is_yx_isa8_osv16_isv4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}}, { os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {}}}, { os_is_zyx_isa8_osv8_isv4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}}}, + { os_is_zyx_isa8_osv16_isv4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}}}, { os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, 0, "oiyx", "oixy?", {{0, 32}, {1, 32}}}}, { os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 3, 0, 0, "oizyx", "oixyz", {{0, 32}, {1, 32}}}}, { is_o_yx_isv32, { 1, 1, 2, 0, 0, "oyxi", "oixy?", {{1, 32}}}}, @@ -995,6 +999,13 @@ public: my_sizes[1] = align_to(my_sizes[1], 32); adjusted_coords[0] = align_to(adjusted_coords[0], 8); adjusted_coords[1] = align_to(adjusted_coords[1], 32); + } else if (fmt == cldnn::format::os_is_yx_isa8_osv16_isv4 && + !(is_aligned_to(my_sizes[0], 16)) && + !(is_aligned_to(my_sizes[1], 32))) { + my_sizes[0] = align_to(my_sizes[0], 16); + my_sizes[1] = align_to(my_sizes[1], 32); + adjusted_coords[0] = align_to(adjusted_coords[0], 16); + adjusted_coords[1] = align_to(adjusted_coords[1], 32); } else if (fmt == cldnn::format::os_is_yx_isa8_osv8_isv4_swizzled_by_4 && !(is_aligned_to(my_sizes[0], 32)) && !(is_aligned_to(my_sizes[1], 32))) { my_sizes[0] = align_to(my_sizes[0], 32); my_sizes[1] = align_to(my_sizes[1], 32); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp index 4797367c27c..7878ece5713 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp @@ -86,8 +86,10 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{ { WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::dlstm_dir_io, { 1, 0, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::os_is_yx_isa8_osv8_isv4, { 0, 1, -1, 2, 3, -1, -1, -1 } }, + { WeightsLayout::os_is_yx_isa8_osv16_isv4, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::os_is_zyx_isa8_osv8_isv4, { 0, 1, 2, 3, 4, -1, -1, -1 } }, + { WeightsLayout::os_is_zyx_isa8_osv16_isv4, { 0, 1, 2, 3, 4, -1, -1, -1 } }, { WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, -1, 2, 3, -1, -1, -1 } }, { WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 0, 1, 2, 3, 4, -1, -1, -1 } }, { WeightsLayout::is_o_yx_isv32, { 1, 2, -1, 0, 3, -1, -1, -1 } }, @@ -457,6 +459,16 @@ NDims WeightsTensor::GetSimpleDims(const std::vector& d, WeightsLayout l newDims[3] = RoundUp(newDims[3], 32); newDims[4] = RoundUp(newDims[4], 8); break; + case os_is_yx_isa8_osv16_isv4: + assert(newDims.size() == 4); + newDims[3] = RoundUp(newDims[3], 16); + newDims[2] = RoundUp(newDims[2], 32); + break; + case os_is_zyx_isa8_osv16_isv4: + assert(newDims.size() == 5); + newDims[3] = RoundUp(newDims[3], 32); + newDims[4] = RoundUp(newDims[4], 16); + break; case os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: assert(newDims.size() == 4); newDims[3] = RoundUp(newDims[3], 32); @@ -693,6 +705,9 @@ NDims WeightsTensor::GetSimpleDims(const std::vector& d, WeightsLayout l } else if (l == os_is_yx_isa8_osv8_isv4 || l == os_is_yx_isa8_osv8_isv4_swizzled_by_4) { ret[0].pitch = 256; ret[1].pitch = ret[0].pitch * ret[0].v; + } else if (l == os_is_yx_isa8_osv16_isv4) { + ret[0].pitch = 512; + ret[1].pitch = ret[0].pitch * ret[0].v; } else if (l == os_i_yxs_osv4_yxsv4) { ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 4; ret[3].pitch = ret[2].v * RoundUp(ret[0].v * ret[1].v, 4); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h index e1b57d72e7c..67fa78eda7f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h @@ -107,6 +107,8 @@ enum WeightsLayout { dlstm_dir_io, // dlstm weights layout direction, input_size, 4* hiden_size os_is_yx_isa8_osv8_isv4, // for MMAD convolution os_is_zyx_isa8_osv8_isv4, // for MMAD convolution + os_is_yx_isa8_osv16_isv4, // for fully connected MMAD + os_is_zyx_isa8_osv16_isv4, // for fully connected MMAD os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, // 1,5... os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp index 8b2e9f7c3ad..306d4b60d23 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.cpp @@ -61,27 +61,62 @@ bool FullyConnectedKernelMMAD::Validate(const Params& params, const optional_par return true; } -FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::SetTuningParams(const fully_connected_params& params) const { +FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::GetTuningParams(const fully_connected_params& params) const { FullyConnectedTuningData tuning_data; const auto& input = params.inputs[0]; + const auto& output = params.output; - size_t feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0 ? - input.Feature().v / 32 : CeilDiv(input.Feature().v, 32); + tuning_data.sub_group_size = 8; + if (input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1 && input.Batch().v == 1) { + // Known cases for TGL where simd16 works better than simd8 + bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512; + bool simd16_exception_2 = input.Feature().v == 21504 && output.Feature().v == 512; - if (feature_blocks_count) - while (feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 && + if (simd16_exception_1 || simd16_exception_2) + tuning_data.sub_group_size = 16; + } + + size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size; + + tuning_data.feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size != 0 ? + input.Feature().v / sub_group_pack_size : + input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 ? + CeilDiv(input.Feature().v, 32) % 2 == 0 ? CeilDiv(input.Feature().v, 64) : CeilDiv(input.Feature().v, 64) - 1 : + CeilDiv(input.Feature().v, sub_group_pack_size); + + bool slm_div_factor_exception = input.Batch().v == 300 && input.Feature().v == 2048 && + output.Batch().v == 300 && (output.Feature().v == 324 || output.Feature().v == 81); + + if (tuning_data.feature_blocks_count && tuning_data.sub_group_size == 8 && !slm_div_factor_exception) + while (tuning_data.feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size)) tuning_data.slm_div_factor *= 2; tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; + tuning_data.full_unroll_factor = tuning_data.feature_blocks_count / tuning_data.slm_div_factor; + + if (tuning_data.sub_group_size == 16) { + tuning_data.unroll_factor = 1; + } else { + size_t temp_unroll_factor = 3; + + if (tuning_data.full_unroll_factor > 3) { + while (tuning_data.full_unroll_factor % temp_unroll_factor) + temp_unroll_factor--; + tuning_data.unroll_factor = temp_unroll_factor; + } else { + tuning_data.unroll_factor = tuning_data.full_unroll_factor; + } + } + return tuning_data; } FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params, int) const { - FullyConnectedTuningData tuning_data = SetTuningParams(params); + FullyConnectedTuningData tuning_data = GetTuningParams(params); auto dispatchData = Parent::SetDefault(params); const auto& output = params.output; @@ -92,84 +127,65 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons } JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params, - const DispatchData& dispatchData) const { - FullyConnectedTuningData tuning_data = SetTuningParams(params); + const DispatchData& runInfo) const { + FullyConnectedTuningData tuning_data = GetTuningParams(params); - auto jit = Parent::GetJitConstants(params, dispatchData); + auto jit = Parent::GetJitConstants(params, runInfo); auto& input = params.inputs[0]; auto& weights = params.weights; + size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size; + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size)); - if (input.GetDims().size() == 5) { - jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)")); + if (tuning_data.sub_group_size == 8) { + if (input.GetDims().size() == 5) { + jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)")); + } else { + jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)")); + } } else { - jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)")); + if (input.GetDims().size() == 5) { + jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(FILTER, f, 0, 0, 0)")); + } else { + jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(FILTER, f, 0, 0, 0, 0)")); + } } - Datatype input_packed_type = Datatype::INT32; - Datatype filter_packed_type = Datatype::INT32; - - if (input.GetDType() == Datatype::UINT8) { - input_packed_type = Datatype::UINT32; - } else if (input.GetDType() == Datatype::INT8) { - input_packed_type = Datatype::INT32; - } - - if (weights.GetDType() == WeightsType::UINT8) { - filter_packed_type = Datatype::UINT32; - } else if (weights.GetDType() == WeightsType::INT8) { - filter_packed_type = Datatype::INT32; - } - - jit.Merge(MakeTypeJitConstants(input_packed_type, "INPUT_PACKED")); - jit.Merge(MakeTypeJitConstants(filter_packed_type, "FILTER_PACKED")); + jit.Merge(MakeTypeJitConstants(input.GetDType() == Datatype::UINT8 ? Datatype::UINT32 : Datatype::INT32, "INPUT_PACKED")); + jit.Merge(MakeTypeJitConstants(weights.GetDType() == WeightsType::UINT8 ? Datatype::UINT32 : Datatype::INT32, "FILTER_PACKED")); auto filter_spatial_size = weights.X().v * weights.Y().v * weights.Z().v; - int filter_spatial_pitch = 4 * 8 * 8; + auto filter_spatial_pitch = 8 * sub_group_pack_size; + auto filter_fblock_pitch = tuning_data.sub_group_size == 8 ? + filter_spatial_size * filter_spatial_pitch : + filter_spatial_size * filter_spatial_pitch * 2; jit.AddConstant(MakeJitConstant("FILTER_SPATIAL_SIZE", filter_spatial_size)); jit.AddConstant(MakeJitConstant("MMAD_FILTER_SPATIAL_PITCH", filter_spatial_pitch)); - jit.AddConstant(MakeJitConstant("MMAD_FILTER_FBLOCK_PITCH", filter_spatial_size * filter_spatial_pitch)); + jit.AddConstant(MakeJitConstant("MMAD_FILTER_FBLOCK_PITCH", filter_fblock_pitch)); size_t input_x_pitch = input.X().pitch; size_t input_y_pitch = input.Y().pitch; size_t input_z_pitch = input.Z().pitch; if (input.GetLayout() == DataLayout::bfyx) { - jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", 32)); + jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", sub_group_pack_size)); } else if (input.GetLayout() == DataLayout::b_fs_yx_fsv32 || input.GetLayout() == DataLayout::b_fs_zyx_fsv32) { input_x_pitch = 32; input_y_pitch *= 32; input_z_pitch *= 32; - jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * 32)); + jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * sub_group_pack_size)); } + bool has_feature_leftovers = (input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size) || + (input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 && CeilDiv(input.Feature().v, 32) % 2); + + jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", has_feature_leftovers)); + jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", tuning_data.feature_blocks_count)); jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); - - size_t feature_blocks_count; - size_t temp_unroll_factor = 9, unroll_factor, full_unroll_factor; - - if (input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0) { - feature_blocks_count = input.Feature().v / 32; - jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", true)); - } else { - feature_blocks_count = CeilDiv(input.Feature().v, 32); - } - - full_unroll_factor = feature_blocks_count / tuning_data.slm_div_factor; - - if (full_unroll_factor > 9) { - while (full_unroll_factor % temp_unroll_factor) - temp_unroll_factor--; - unroll_factor = temp_unroll_factor; - } else { - unroll_factor = full_unroll_factor; - } - - jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", feature_blocks_count)); - jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", unroll_factor)); - jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", full_unroll_factor)); + jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", tuning_data.unroll_factor)); + jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", tuning_data.full_unroll_factor)); jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size)); jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch)); @@ -197,10 +213,9 @@ KernelsData FullyConnectedKernelMMAD::GetKernelsData(const Params& params, const auto fc_params = static_cast(params); auto& input = fc_params.inputs[0]; - auto w_layout = WeightsLayout::os_is_yx_isa8_osv8_isv4; - if (input.GetDims().size() == 5) { - w_layout = WeightsLayout::os_is_zyx_isa8_osv8_isv4; - } + auto w_layout = GetTuningParams(fc_params).sub_group_size == 16 ? + input.GetDims().size() == 4 ? WeightsLayout::os_is_yx_isa8_osv16_isv4 : WeightsLayout::os_is_zyx_isa8_osv16_isv4 : + input.GetDims().size() == 4 ? WeightsLayout::os_is_yx_isa8_osv8_isv4 : WeightsLayout::os_is_zyx_isa8_osv8_isv4; KernelsData res = {}; for (size_t i = 0; i < autoTuneOptions.size(); i++) { diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h index af7cb336e9a..529f4c5db3b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad.h @@ -1,4 +1,4 @@ -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,20 +30,25 @@ public: ParamsKey GetSupportedKey() const override; struct FullyConnectedTuningData { - const size_t sub_group_size = 8; + const size_t pack_size = 4; + size_t sub_group_size = 8; size_t slm_div_factor = 1; size_t work_group_size = 1; + size_t feature_blocks_count; + size_t unroll_factor; + size_t full_unroll_factor; }; protected: - JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& dispatchData) const override; + JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override; DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override; std::vector GetSupportedFusedOps() const override { return { FusedOpType::QUANTIZE, FusedOpType::SCALE, - FusedOpType::ACTIVATION }; + FusedOpType::ACTIVATION, + FusedOpType::ELTWISE }; } bool Validate(const Params& params, const optional_params& options) const override; - FullyConnectedTuningData SetTuningParams(const fully_connected_params& params) const; + FullyConnectedTuningData GetTuningParams(const fully_connected_params& params) const; }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl index 95fc65da680..7b59f7e15d5 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_MMAD.cl @@ -19,10 +19,17 @@ #include "include/fetch.cl" #include "include/mmad.cl" -#define INPUT_PACKED_TYPE_8 CAT(INPUT_PACKED_TYPE, 8) -#define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8) +#define INPUT_PACKED_TYPE_8 CAT(INPUT_PACKED_TYPE, 8) +#define FILTER_PACKED_TYPE_8 CAT(FILTER_PACKED_TYPE, 8) +#define INPUT_PACKED_TYPE_VEC CAT(INPUT_PACKED_TYPE, SUB_GROUP_SIZE) +#define FILTER_PACKED_TYPE_VEC CAT(FILTER_PACKED_TYPE, SUB_GROUP_SIZE) -#define AS_TYPE(type, val) CAT(as_, type)(val) +#define BLOCK_READ(ptr) intel_sub_group_block_read((const __global uint*)(ptr)) +#define BLOCK_READ_8(ptr) intel_sub_group_block_read8((const __global uint*)(ptr)) + +#define MMAD CAT(MMAD_, SUB_GROUP_SIZE) + +#define AS_TYPE(type, val) CAT(as_, type)(val) __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) KERNEL(fully_connected_gpu_MMAD)( @@ -64,25 +71,27 @@ KERNEL(fully_connected_gpu_MMAD)( uint k = feature_block * FULL_UNROLL_FACTOR; #else for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR) -#endif +#endif // FULL_UNROLL_FACTOR < 2 { -# if !SPLIT_SPATIAL +#if !SPLIT_SPATIAL for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { -# else +#else for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi) for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) { const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y; -#endif +#endif // !SPLIT_SPATIAL + #else // SPATIAL_MAJOR -# if !SPLIT_SPATIAL + +#if !SPLIT_SPATIAL for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { -# else +#else for (uint zi = 0; zi < FILTER_SIZE_Z; ++zi) for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) { const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y; -# endif +#endif // !SPLIT_SPATIAL #if FULL_UNROLL_FACTOR < 2 for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k) @@ -90,21 +99,20 @@ KERNEL(fully_connected_gpu_MMAD)( uint k = feature_block * FULL_UNROLL_FACTOR; #else for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR) -#endif +#endif // FULL_UNROLL_FACTOR < 2 { -#endif +#endif // SPATIAL_MAJOR + #if !SPLIT_SPATIAL uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH; #else uint input_idx = input_offset + k * MMAD_INPUT_FBLOCK_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH; -#endif +#endif // !SPLIT_SPATIAL uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH; #if UNROLL_FACTOR < 2 - uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx)); - INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u); - - INPUT_PACKED_TYPE_8 activations; + INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx)); + INPUT_PACKED_TYPE_VEC activations; activations.s0 = sub_group_broadcast(input_data, 0); activations.s1 = sub_group_broadcast(input_data, 1); @@ -114,27 +122,44 @@ KERNEL(fully_connected_gpu_MMAD)( activations.s5 = sub_group_broadcast(input_data, 5); activations.s6 = sub_group_broadcast(input_data, 6); activations.s7 = sub_group_broadcast(input_data, 7); +#if SUB_GROUP_SIZE == 16 + activations.s8 = sub_group_broadcast(input_data, 8); + activations.s9 = sub_group_broadcast(input_data, 9); + activations.sa = sub_group_broadcast(input_data, 0xa); + activations.sb = sub_group_broadcast(input_data, 0xb); + activations.sc = sub_group_broadcast(input_data, 0xc); + activations.sd = sub_group_broadcast(input_data, 0xd); + activations.se = sub_group_broadcast(input_data, 0xe); + activations.sf = sub_group_broadcast(input_data, 0xf); +#endif // SUB_GROUP_SIZE == 16 - uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)); - FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u); - - dotProd = MMAD_8(activations, weights_data, dotProd); + FILTER_PACKED_TYPE_VEC weights_data; +#if SUB_GROUP_SIZE == 8 + weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx)); #else + weights_data.lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx)); + weights_data.hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 8 * 4)); +#endif // SUB_GROUP_SIZE == 8 + + dotProd = MMAD(activations, weights_data, dotProd); +#else // UNROLL_FACTOR < 2 INPUT_PACKED_TYPE input_data[UNROLL_FACTOR]; - FILTER_PACKED_TYPE_8 weights_data[UNROLL_FACTOR]; + FILTER_PACKED_TYPE_VEC weights_data[UNROLL_FACTOR]; __attribute__((opencl_unroll_hint)) for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { - input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, intel_sub_group_block_read((const __global uint*)(input + - input_idx + kb * MMAD_INPUT_FBLOCK_PITCH))); - - uint8 weights_data_u0 = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH)); - weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u0); + input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx + kb * MMAD_INPUT_FBLOCK_PITCH)); +#if SUB_GROUP_SIZE == 8 + weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH)); +#else + weights_data[kb].lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH)); + weights_data[kb].hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 32 + kb * MMAD_FILTER_FBLOCK_PITCH)); +#endif // SUB_GROUP_SIZE } __attribute__((opencl_unroll_hint)) for (uint kb = 0; kb < UNROLL_FACTOR; kb++) { - INPUT_PACKED_TYPE_8 in; + INPUT_PACKED_TYPE_VEC in; in.s0 = sub_group_broadcast(input_data[kb], 0); in.s1 = sub_group_broadcast(input_data[kb], 1); @@ -144,8 +169,17 @@ KERNEL(fully_connected_gpu_MMAD)( in.s5 = sub_group_broadcast(input_data[kb], 5); in.s6 = sub_group_broadcast(input_data[kb], 6); in.s7 = sub_group_broadcast(input_data[kb], 7); - - dotProd = MMAD_8(in, weights_data[kb], dotProd); +#if SUB_GROUP_SIZE == 16 + in.s8 = sub_group_broadcast(input_data[kb], 8); + in.s9 = sub_group_broadcast(input_data[kb], 9); + in.sa = sub_group_broadcast(input_data[kb], 0xa); + in.sb = sub_group_broadcast(input_data[kb], 0xb); + in.sc = sub_group_broadcast(input_data[kb], 0xc); + in.sd = sub_group_broadcast(input_data[kb], 0xd); + in.se = sub_group_broadcast(input_data[kb], 0xe); + in.sf = sub_group_broadcast(input_data[kb], 0xf); +#endif // SUB_GROUP_SIZE == 16 + dotProd = MMAD(in, weights_data[kb], dotProd); } #endif // UNROLL_FACTOR < 2 } @@ -174,6 +208,7 @@ KERNEL(fully_connected_gpu_MMAD)( #endif // !SPLIT_SPATIAL #else // SPATIAL_MAJOR + #if !SPLIT_SPATIAL for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) { #else // !SPLIT_SPATIAL @@ -182,24 +217,27 @@ KERNEL(fully_connected_gpu_MMAD)( for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) { const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y; #endif // !SPLIT_SPATIAL + #endif // SPATIAL_MAJOR #if !SPLIT_SPATIAL uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH; -#else // !SPLIT_SPATIAL - uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH; -#endif // !SPLIT_SPATIAL +#else + uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH + + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH; +#endif // !SPLIT_SPATIAL uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH; MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0); for (uint i = 0; i < 4; i++) { - if (FEATURE_BLOCKS_COUNT * 32 + sglid * 4 + i < INPUT0_FEATURE_NUM) { + if (FEATURE_BLOCKS_COUNT * SUB_GROUP_SIZE * 4 + sglid * 4 + i < INPUT0_FEATURE_NUM) { input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH]; } } INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u); - INPUT_PACKED_TYPE_8 activations; //activations of all lanes + INPUT_PACKED_TYPE_VEC activations; + activations.s0 = sub_group_broadcast(input_data, 0); activations.s1 = sub_group_broadcast(input_data, 1); activations.s2 = sub_group_broadcast(input_data, 2); @@ -208,11 +246,26 @@ KERNEL(fully_connected_gpu_MMAD)( activations.s5 = sub_group_broadcast(input_data, 5); activations.s6 = sub_group_broadcast(input_data, 6); activations.s7 = sub_group_broadcast(input_data, 7); +#if SUB_GROUP_SIZE == 16 + activations.s8 = sub_group_broadcast(input_data, 8); + activations.s9 = sub_group_broadcast(input_data, 9); + activations.sa = sub_group_broadcast(input_data, 0xa); + activations.sb = sub_group_broadcast(input_data, 0xb); + activations.sc = sub_group_broadcast(input_data, 0xc); + activations.sd = sub_group_broadcast(input_data, 0xd); + activations.se = sub_group_broadcast(input_data, 0xe); + activations.sf = sub_group_broadcast(input_data, 0xf); +#endif // SUB_GROUP_SIZE == 16 - uint8 weights_data_u = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)); - FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u); + FILTER_PACKED_TYPE_VEC weights_data; +#if SUB_GROUP_SIZE == 8 + weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx)); +#else + weights_data.lo = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx)); + weights_data.hi = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + SUB_GROUP_SIZE * 32)); +#endif // SUB_GROUP_SIZE == 8 - dotProd = MMAD_8(activations, weights_data, dotProd); + dotProd = MMAD(activations, weights_data, dotProd); } #endif // HAS_FEATURE_LEFTOVERS @@ -220,16 +273,16 @@ KERNEL(fully_connected_gpu_MMAD)( return; #if BIAS_TERM -#if BIAS_PER_OUTPUT +#if BIAS_PER_OUTPUT const uint bias_index = GET_DATA_INDEX(BIAS, batch, feature, 0, 0); #elif BIAS_PER_OFM const uint bias_index = feature; -#endif +#endif // BIAS_PER_OUTPUT float dequantized = (float)dotProd + biases[bias_index]; -#else // BIAS_TERM +#else float dequantized = (float)dotProd; -#endif +#endif // BIAS_TERM const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0); @@ -240,7 +293,7 @@ KERNEL(fully_connected_gpu_MMAD)( output[out_idx] = res; #else output[out_idx] = TO_OUTPUT_TYPE(dequantized); -#endif +#endif // HAS_FUSED_OPS #if SLM_DIV_FACTOR > 1 } @@ -249,4 +302,11 @@ KERNEL(fully_connected_gpu_MMAD)( #undef INPUT_PACKED_TYPE_8 #undef FILTER_PACKED_TYPE_8 +#undef INPUT_PACKED_TYPE_VEC +#undef FILTER_PACKED_TYPE_VEC + +#undef BLOCK_READ +#undef BLOCK_READ_8 + +#undef MMAD #undef AS_TYPE diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl index 92bae09b2b2..8b685eb66b7 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl @@ -715,6 +715,63 @@ inline uint FUNC(get_os_is_zyx_isa8_osv8_isv4_index)(uint o, uint i, uint z, uin CAT(prefix, _OFM_NUM), \ CAT(prefix, _OFFSET)) +inline uint FUNC(get_os_is_yx_isa8_osv16_isv4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset) +{ + const uint f_32_aligned = ((size_ifm + 31)/32) * 32; + const uint isv2_idx = i % 4; + const uint osv_idx = o % 16; + const uint isv1_idx = (i / 4) % 8; + const uint is_idx = i / 32; + const uint os_idx = o / 16; + + size_t idx = offset + isv2_idx + 4 * (osv_idx + 16 * isv1_idx); + idx += x * 4 * 8 * 16; + idx += y * size_x * 4 * 8 * 16; + idx += is_idx * size_y * size_x * 4 * 8 * 16; + idx += os_idx * (f_32_aligned/32) * size_y * size_x * 4 * 8 * 16; + + return idx; +} + +#define GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(prefix, o, i, y, x) \ + FUNC_CALL(get_os_is_yx_isa8_osv16_isv4_index)( \ + o, i, y, x, CAT(prefix, _SIZE_X ), \ + CAT(prefix, _SIZE_Y), \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + CAT(prefix, _OFFSET)) + +inline uint FUNC(get_os_is_zyx_isa8_osv16_isv4_index)(uint o, uint i, uint z, uint y, uint x, + uint size_x, uint size_y, uint size_z, + uint size_ifm, uint size_ofm, uint offset) +{ + const uint ifm_slices = (size_ifm + 31)/32; + const uint isv2_idx = i % 4; + const uint osv_idx = o % 16; + const uint isv1_idx = (i / 4) % 8; + const uint is_idx = i / 32; + const uint os_idx = o / 16; + + size_t idx = offset + isv2_idx + 4 * (osv_idx + 16 * isv1_idx); + idx += x * 4 * 8 * 16; + idx += y * size_x * 4 * 8 * 16; + idx += z * size_y * size_x * 4 * 8 * 16; + idx += is_idx * size_z * size_y * size_x * 4 * 8 * 16; + idx += os_idx * ifm_slices * size_z * size_y * size_x * 4 * 8 * 16; + + return idx; +} + +#define GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(prefix, o, i, z, y, x) \ + FUNC_CALL(get_os_is_zyx_isa8_osv16_isv4_index)( \ + o, i, z, y, x, \ + CAT(prefix, _SIZE_X ), \ + CAT(prefix, _SIZE_Y), \ + CAT(prefix, _SIZE_Z), \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + CAT(prefix, _OFFSET)) + inline uint FUNC(get_os_is_yx_isa8_osv8_isv4_swizzled_by_4_index)(uint o, uint i, uint y, uint x, uint size_x, uint size_y, uint size_ifm, uint size_ofm, uint offset) { const uint o_swizzled = (o % 4) * 8 + ((o % 32) / 4) + (o / 32) * 32; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl index d323b9ccf90..40bd275ca63 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl @@ -783,6 +783,7 @@ inline uchar FUNC(sub_group_block_read_uchar)(const __local uchar* ptr) __attrib } #define MMAD_8(A, B, C) FUNC_CALL(mmad8)(A, B, C) +#define MMAD_16(A, B, C) FUNC_CALL(mmad16)(A, B, C) #define MMAD_4x8(A, B, C) FUNC_CALL(mmad4x8)(A, B, C) #define MMAD_8x8(A, B, C) FUNC_CALL(mmad8x8)(A, B, C) #define MMAD_16x16(A, B, C) FUNC_CALL(mmad16x16)(A, B, C) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl index f1230a80b03..d42d438f745 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl @@ -48,6 +48,10 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, y, x); #elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4 return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(INPUT0, o, i, z, y, x); +#elif defined INPUT0_LAYOUT_OS_IS_YX_ISA8_OSV16_ISV4 + return GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(INPUT0, o, i, y, x); +#elif defined INPUT0_LAYOUT_OS_IS_ZYX_ISA8_OSV16_ISV4 + return GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(INPUT0, o, i, z, y, x); #elif defined INPUT0_LAYOUT_IS_O_YX_ISV32 return GET_FILTER_IS_O_YX_ISV32(INPUT0, o, i, y, x); #elif defined INPUT0_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4 @@ -156,6 +160,10 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISA8_OSV8_ISV4 return GET_FILTER_OS_IS_ZYX_ISA8_OSV8_ISV4_INDEX(OUTPUT, o, i, z, y, x); +#elif defined OUTPUT_LAYOUT_OS_IS_YX_ISA8_OSV16_ISV4 + return GET_FILTER_OS_IS_YX_ISA8_OSV16_ISV4_INDEX(OUTPUT, o, i, y, x); +#elif defined OUTPUT_LAYOUT_OS_IS_ZYX_ISA8_OSV16_ISV4 + return GET_FILTER_OS_IS_ZYX_ISA8_OSV16_ISV4_INDEX(OUTPUT, o, i, z, y, x); #elif defined OUTPUT_LAYOUT_IS_O_YX_ISV32 return GET_FILTER_IS_O_YX_ISV32(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_IS_O32_YX_ISV32_SWIZZLED_BY_4 diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp index dab028ecbdc..259a40bae62 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp @@ -330,8 +330,10 @@ std::string toString(WeightsLayout layout) { case WeightsLayout::image_2d_weights_winograd_6x3_s1_xfbyb: return "IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB"; case WeightsLayout::dlstm_dir_io: return "DLSTM_DIR_IO"; case WeightsLayout::os_is_yx_isa8_osv8_isv4: return "OS_IS_YX_ISA8_OSV8_ISV4"; + case WeightsLayout::os_is_yx_isa8_osv16_isv4: return "OS_IS_YX_ISA8_OSV16_ISV4"; case WeightsLayout::os_is_yx_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::os_is_zyx_isa8_osv8_isv4: return "OS_IS_ZYX_ISA8_OSV8_ISV4"; + case WeightsLayout::os_is_zyx_isa8_osv16_isv4: return "OS_IS_ZYX_ISA8_OSV16_ISV4"; case WeightsLayout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_YX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: return "OS_IS_ZYX_OSA4_ISA8_OSV8_ISV4_SWIZZLED_BY_4"; case WeightsLayout::is_o_yx_isv32: return "IS_O_YX_ISV32"; diff --git a/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h b/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h index 0ffdc56208e..53f4317f11e 100644 --- a/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h +++ b/inference-engine/thirdparty/clDNN/src/include/to_string_utils.h @@ -144,8 +144,12 @@ inline std::string fmt_to_str(format fmt) { return "os_is_yx_isv16_osv16"; case format::os_is_yx_isa8_osv8_isv4: return "os_is_yx_isa8_osv8_isv4"; + case format::os_is_yx_isa8_osv16_isv4: + return "os_is_yx_isa8_osv16_isv4"; case format::os_is_zyx_isa8_osv8_isv4: return "os_is_zyx_isa8_osv8_isv4"; + case format::os_is_zyx_isa8_osv16_isv4: + return "os_is_zyx_isa8_osv16_isv4"; case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return "os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4"; case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: diff --git a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp index 542f842aff0..593ac0b0375 100644 --- a/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp +++ b/inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp @@ -238,8 +238,12 @@ kernel_selector::weights_layout to_weights_layout(format f) { return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb; case format::os_is_yx_isa8_osv8_isv4: return kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4; + case format::os_is_yx_isa8_osv16_isv4: + return kernel_selector::weights_layout::os_is_yx_isa8_osv16_isv4; case format::os_is_zyx_isa8_osv8_isv4: return kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4; + case format::os_is_zyx_isa8_osv16_isv4: + return kernel_selector::weights_layout::os_is_zyx_isa8_osv16_isv4; case format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; case format::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: @@ -390,6 +394,10 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { return cldnn::format::os_is_yx_isa8_osv8_isv4; case kernel_selector::weights_layout::os_is_zyx_isa8_osv8_isv4: return cldnn::format::os_is_zyx_isa8_osv8_isv4; + case kernel_selector::weights_layout::os_is_yx_isa8_osv16_isv4: + return cldnn::format::os_is_yx_isa8_osv16_isv4; + case kernel_selector::weights_layout::os_is_zyx_isa8_osv16_isv4: + return cldnn::format::os_is_zyx_isa8_osv16_isv4; case kernel_selector::weights_layout::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4: return cldnn::format::os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4; case kernel_selector::weights_layout::os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4: