diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_b_fs_yx_fsv16.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_b_fs_yx_fsv16.cl
index f8c9da73b67..44a2ef0fc9f 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_b_fs_yx_fsv16.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_b_fs_yx_fsv16.cl
@@ -10,6 +10,7 @@
 #define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE)
 #define TO_TYPE(type, val) CAT(convert_, type)(val)
+#define TO_TYPE_SAT(type, val) CAT(CAT(convert_, type), _sat)(val)
 
 #if BLOCK_SIZE != 1
 #define READ_FUNC(ptr, offset) CAT(DT_INPUT_BLOCK_READ, BLOCK_SIZE)(ptr, offset)
 
@@ -80,7 +81,12 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
     OUTPUT_TYPE_BLOCK out = TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE), FUSED_OPS_RESULT);
 #else
 #if BLOCK_SIZE != 1
+#if OUTPUT_IS_FP
     OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE), res), ACTIVATION_PARAMS_TYPED);
+#else
+    OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE_SAT(MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE), res), ACTIVATION_PARAMS_TYPED);
+#endif
+
 #else
     OUTPUT_TYPE out = ACTIVATION_TYPED(TO_OUTPUT_TYPE(res), ACTIVATION_PARAMS_TYPED);
 #endif
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_blocked_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_blocked_opt.cl
new file mode 100644
index 00000000000..fe3d975f831
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/eltwise_blocked_opt.cl
@@ -0,0 +1,72 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "include/batch_headers/fetch_data.cl"
+
+#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
+#define TO_TYPE(type, val) CAT(convert_, type)(val)
+#define TO_TYPE_SAT(type, val) CAT(CAT(convert_, type), _sat)(val)
+
+#if ELTWISE_BROADCAST
+    #define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX_SAFE)(idx_order)
+#else
+    #define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX)(idx_order)
+#endif
+
+KERNEL(eltwise_blocked_opt)(INPUTS_DECLS
+                            __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+, FUSED_OPS_DECLS
+#endif
+)
+{
+    const uint zyx = (uint)get_global_id(1);
+#if OUTPUT_DIMS == 5
+    const uint z = zyx / (uint)XY_BLOCK;
+    const uint yx = zyx % XY_BLOCK;
+    const uint y = yx / OUTPUT_SIZE_X;
+    const uint x = yx % OUTPUT_SIZE_X;
+#else
+    const uint z = 0;
+    const uint y = zyx / OUTPUT_SIZE_X;
+    const uint x = zyx % OUTPUT_SIZE_X;
+#endif
+
+    const uint f_block = get_global_id(0);
+    const uint b = get_global_id(2);
+
+    MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, VEC_SIZE) res;
+
+    DO_ELTWISE
+
+#if HAS_FUSED_OPS
+    FUSED_OPS;
+    OUTPUT_TYPE_BLOCK out = TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), FUSED_OPS_RESULT);
+#else
+#if QUANTIZATION_TERM && !OUTPUT_IS_FP
+    OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE_SAT(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), res), ACTIVATION_PARAMS_TYPED);
+#else
+    OUTPUT_TYPE_BLOCK out = ACTIVATION_TYPED(TO_TYPE(MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE), res), ACTIVATION_PARAMS_TYPED);
+#endif
+#endif
+
+#ifdef LEFTOVERS
+    // Overwrite
+    if ((f_block*VEC_SIZE + VEC_SIZE) > OUTPUT_FEATURE_NUM) {
+        for (uint fp = OUTPUT_FEATURE_NUM % VEC_SIZE; fp < VEC_SIZE; fp++) {
+            out[fp] = OUTPUT_VAL_ZERO;
+        }
+    }
+#endif
+
+#if OUTPUT_DIMS == 5
+    VSTORE_N(out, 0, &output[OUTPUT_GET_INDEX(b, (f_block*VEC_SIZE), z, y, x)]);
+#else
+    VSTORE_N(out, 0, &output[OUTPUT_GET_INDEX(b, (f_block*VEC_SIZE), y, x)]);
+#endif
+}
+
+#undef OUTPUT_TYPE_BLOCK
+#undef TO_TYPE
+#undef TO_TYPE_SAT
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp
index 1b86245cb64..5b5171df9e5 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp
@@ -270,6 +270,7 @@ std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const
     auto blocked_bsv_fsv_layout = output_layout == DataLayout::bs_fs_yx_bsv16_fsv2 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv2 ||
                                   output_layout == DataLayout::bs_fs_yx_bsv16_fsv4 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv4 ||
                                   output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_yx_bsv16_fsv32 ||
+                                  output_layout == DataLayout::bs_fs_yx_bsv32_fsv16 ||
                                   output_layout == DataLayout::bs_fs_yx_bsv32_fsv32 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16 ||
                                   output_layout == DataLayout::bs_fs_zyx_bsv16_fsv32 || output_layout == DataLayout::bs_fs_zyx_bsv32_fsv16 ||
                                   output_layout == DataLayout::bs_fs_zyx_bsv32_fsv32;
@@ -388,6 +389,9 @@ std::vector<size_t> GetOptimalLocalWorkGroupSizes(std::vector<size_t> gws, const
         } else if ((output_layout == DataLayout::bs_fs_yx_bsv16_fsv16 || output_layout == DataLayout::bs_fs_zyx_bsv16_fsv16) &&
                    (axis_by_gws[b] != axis_by_gws[f]) && (axis_by_gws[b] != unused_axis)) {
             max_optimal_lws0_value = 16;
+        } else if ((output_layout == DataLayout::bs_fs_yx_bsv32_fsv32 || output_layout == DataLayout::bs_fs_zyx_bsv32_fsv32) &&
+                   (axis_by_gws[b] != axis_by_gws[f]) && (axis_by_gws[b] != unused_axis)) {
+            max_optimal_lws0_value = 32;
         }
     }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.cpp
new file mode 100644
index 00000000000..debcb508108
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.cpp
@@ -0,0 +1,388 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "eltwise_kernel_blocked_opt.h"
+#include "kernel_selector_utils.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+static inline bool InputHasFeatureBroadcast(const eltwise_params& params, const size_t op_num, const size_t input_idx);
+static inline bool IsBroadcastingPossibleInput(const DataTensor& input, const DataTensor& output);
+static inline int GetFeatureBlockSizeFromFormat(const eltwise_params& params, size_t index);
+
+ParamsKey EltwiseKernel_blocked_opt::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableInputLayout(DataLayout::b_fs_zyx_fsv32);
+    k.EnableOutputLayout(DataLayout::b_fs_zyx_fsv32);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv32);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv32);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv32);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv32_fsv16);
+    k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
+    k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv32_fsv32);
+    k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv32_fsv32);
+    k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv32);
+    k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv32);
+    k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv32_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv32_fsv16);
+    k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
+    k.EnableOutputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
+    k.EnableDifferentTypes();
+    k.EnableBatching();
+    k.EnableTensorPitches();
+    k.EnableTensorOffset();
+    k.EnableEltwiseBroadcast();
+    return k;
+}
+
+KernelsData EltwiseKernel_blocked_opt::GetKernelsData(const Params& params, const optional_params& options) const {
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    KernelData kd = KernelData::Default<eltwise_params>(params);
+    eltwise_params& newParams = *static_cast<eltwise_params*>(kd.params.get());
+
+    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options);
+    auto cldnn_jit = GetJitConstants(newParams);
+    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    DispatchData dispatchData = SetDefault(newParams);
+
+    auto& kernel = kd.kernels[0];
+
+    kernel.params.workGroups.global = dispatchData.gws;
+    kernel.params.workGroups.local = dispatchData.lws;
+
+    kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
+    kernel.params.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(),
+                                          false,
+                                          false,
+                                          GetFusedPrimitiveInputsCount(params));
+
+    return {kd};
+}
+
+KernelsPriority EltwiseKernel_blocked_opt::GetKernelsPriority(const Params& /*params*/, const optional_params& /*options*/) const {
+    return FORCE_PRIORITY_1;
+}
+
+// Protected
+bool EltwiseKernel_blocked_opt::Validate(const Params& params, const optional_params& o) const {
+    if (!EltwiseKernelBase::Validate(params, o)) {
+        return false;
+    }
+
+    const auto& ewParams = static_cast<const eltwise_params&>(params);
+    if (IsUnsupportedModeForVecCode(ewParams))
+        return false;
+
+    for (size_t i = 0; i < ewParams.inputs.size(); i++) {
+        if ((GetFeatureBlockSizeFromFormat(ewParams, i) == 1) &&
+            !IsBroadcastingPossibleInput(ewParams.inputs[i], ewParams.outputs[0])) {
+            return false;
+        }
+    }
+
+    const auto vec_size = GetFeatureBlockSizeFromFormat(ewParams, 0);
+    const auto input0 = ewParams.inputs[0];
+    const auto& output = ewParams.outputs[0];
+    // Check that padding before features doesn't mis-align the blocks
+    if (input0.Feature().pad.before % vec_size != 0 || output.Feature().pad.before % vec_size != 0)
+        return false;
+
+    auto compareTensors = [](const DataTensor& input0, const DataTensor& input1) -> bool {
+        // Check all parameters except DataType
+        auto& input0_dims = input0.GetDims();
+        auto& input1_dims = input1.GetDims();
+        bool same = input0.GetLayout() == input1.GetLayout() &&
+                    input0.GetPaddedVal() == input1.GetPaddedVal() &&
+                    input0.GetViewOffset() == input1.GetViewOffset() &&
+                    input0_dims.size() == input1_dims.size();
+        for (size_t i = 0; i < input0_dims.size(); i++) {
+            same &= input0_dims[i].v == input1_dims[i].v &&
+                    input0_dims[i].pad.before == input1_dims[i].pad.before &&
+                    input0_dims[i].pad.after == input1_dims[i].pad.after &&
+                    input0_dims[i].pitch == input1_dims[i].pitch;
+        }
+
+        return same;
+    };
+
+    for (size_t i = 1; i < ewParams.inputs.size(); i++) {
+        if (ewParams.inputs[i].LogicalSize() == input0.LogicalSize() && !(compareTensors(ewParams.inputs[i], input0)))
+            return false;
+        if (ewParams.inputs[i].Feature().pad.before % vec_size != 0) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+JitConstants EltwiseKernel_blocked_opt::MakeLoadJitConstants(const eltwise_params& params, bool /*use_vload*/) const {
+    const auto vec_size = GetFeatureBlockSizeFromFormat(params, 0);
+    JitConstants jit = {};
+    std::string vload_decls;
+
+
+    // Make load jit constants
+    for (size_t op_num = 0; op_num < params.operations.size(); op_num++) {
+        const std::string op_num_str = toCodeString(op_num);
+        const auto &ew = params.operations[op_num];
+        for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) {
+            const auto &input = ew.inputs[input_idx];
+            const std::string name = "INPUT_" + op_num_str + "_" + toCodeString(input_idx);
+
+            // Get a string for a default index based on dimension
+            std::string default_indexing_str;
+            if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 4)
+                default_indexing_str = "b, (f_block * " + toCodeString(vec_size) +"), y, x";
+            else if (DataTensor::ChannelsCount(params.inputs[input_idx].GetLayout()) == 5)
+                default_indexing_str = "b, (f_block * " + toCodeString(vec_size) +"), z, y, x";
+            else
+                IE_ASSERT("MakeLoadJit : Unexpected dimension for eltwise optimized kernel.");
+
+            switch (input.mode) {
+                case EltwiseInputMode::SCALAR:
+                    jit.AddConstant(MakeJitConstant(name, input.scalar));
+                    break;
+                case EltwiseInputMode::INPUT_BUFFER:
+                {
+                    const std::string idx_order = "INPUT" + toCodeString(input.index) + "_IDX_ORDER";
+                    jit.AddConstant(MakeJitConstant(idx_order, default_indexing_str));
+
+                    if (params.inputs[input.index].LogicalSize() == 1) {
+                        const std::string vload_name = "DO_VLOAD" + toCodeString(op_num) + "_" + toCodeString(input_idx);
+                        const std::string vload_value = "\\\n\tMAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, " + toCodeString(vec_size) + ") " +
+                                                        "tmp_a" + toCodeString(op_num) + "_" + toCodeString(input_idx) + " = " +
+                                                        "(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, " + toCodeString(vec_size) + "))" +
+                                                        "(input" + toCodeString(input.index) + "[0])";
+                        jit.AddConstant(MakeJitConstant(vload_name, vload_value));
+                        jit.AddConstant(MakeJitConstant(name, "tmp_a" + toCodeString(op_num) + "_" + toCodeString(input_idx)));
+                    } else {
+                        bool feature_broadcasting = (params.inputs[input_idx].Feature().v == 1 && params.outputs[0].Feature().v != 1);
+
+                        if (feature_broadcasting) {
+                            const std::string broadcast_name = "DO_FEATURE_BROADCAST" + toCodeString(op_num) + "_" + toCodeString(input_idx);
+                            std::string broadcast_value = "\\\n\tMAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, " + toCodeString(vec_size) + ") tmp_b" +
+                                                          toCodeString(op_num) + " = (MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, " + toCodeString(vec_size) + "))" +
+                                                          "(input" + toCodeString(input.index) + "[GET_INDEX(INPUT, " + toCodeString(input.index) +
+                                                          ", " + idx_order + ")]);";
+
+                            jit.AddConstant(MakeJitConstant(broadcast_name, broadcast_value));
+                            jit.AddConstant(MakeJitConstant(name, "tmp_b" + toCodeString(op_num)));
+                        } else {
+                            const std::string vload_name = "DO_VLOAD" + toCodeString(op_num) + "_" + toCodeString(input_idx);
+                            const std::string vload_value =
"\\\n\tMAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, " + toCodeString(vec_size) + ")" + + " tmp_a" + toCodeString(op_num) + "_" + toCodeString(input_idx) + + " = TO_TYPE(MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, " + toCodeString(vec_size) + "), vload" + + toCodeString(vec_size) + "(0, &input" + toCodeString(input.index) + + "[GET_INDEX(INPUT," + toCodeString(input.index) + ", " + idx_order + ")]));"; + + jit.AddConstant(MakeJitConstant(vload_name, vload_value)); + jit.AddConstant(MakeJitConstant(name, "tmp_a" + toCodeString(op_num) + "_" + toCodeString(input_idx))); + } + } + break; + } + case EltwiseInputMode::OUTPUT_BUFFER: + jit.AddConstant(MakeJitConstant(name, "output[off]")); + break; + case EltwiseInputMode::UNORDERED_ACCESS_INPUT_BUFFER: + jit.AddConstant(MakeJitConstant( + name, + "input" + toCodeString(input.index) + "[(size_t)tmp" + toCodeString(input.tmpIndex) + "]")); + break; + case EltwiseInputMode::INTERMEDIATE_RESULTS_INDEX: + jit.AddConstant(MakeJitConstant(name, "tmp" + toCodeString(input.tmpIndex))); + break; + default: + break; + } + } + } + + return jit; +} + +JitConstants EltwiseKernel_blocked_opt::GetJitConstants(const eltwise_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + const auto vec_size = GetFeatureBlockSizeFromFormat(params, 0); + + jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR")); + jit.AddConstant(MakeJitConstant("BLOCK_SIZE", vec_size)); + jit.AddConstant(MakeJitConstant("XY_BLOCK", params.outputs[0].X().v * params.outputs[0].Y().v)); + + bool use_vload = false; + jit.Merge(MakeInputDeclsJitConstants(params, use_vload)); + jit.Merge(MakeLoadJitConstants(params, use_vload)); + jit.Merge(GetOperationsJitConstants(params, use_vload, vec_size)); + + std::string do_eltwise; + auto& operations = params.operations; + for (size_t op_num = 0; op_num < operations.size(); op_num++) { + const auto &ew = operations[op_num]; + for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { + const auto &input = ew.inputs[input_idx]; + if (input.mode != EltwiseInputMode::INPUT_BUFFER && input.mode != EltwiseInputMode::SCALAR) + continue; + + if (InputHasFeatureBroadcast(params, op_num, input_idx)) { + do_eltwise += "\\\n\tDO_FEATURE_BROADCAST" + toCodeString(op_num) + "_" + toCodeString(input_idx) + ";"; + } else { + do_eltwise += "\\\n\tDO_VLOAD" + toCodeString(op_num) + "_" + toCodeString(input_idx) + ";"; + } + } + do_eltwise += "\\\n\tOPERATION" + toCodeString(op_num) + ";"; + } + + do_eltwise += "\\\n\tres = tmp" + toCodeString(operations.size() - 1) + ";"; + + jit.AddConstant(MakeJitConstant("DO_ELTWISE", do_eltwise)); + + if (params.layoutBased || params.int8_quantization || params.broadcast) { + jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + } + + if (!params.stride.empty()) { + jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); + } + + jit.Merge(MakeActivationJitConstants(params.activations, params.outputs[0].GetDType(), "_TYPED")); + + if (params.outputs[0].Feature().v % vec_size != 0) + jit.AddConstant(MakeJitConstant("LEFTOVERS", params.outputs[0].Feature().v % vec_size)); + + // Fused_ops + if (!params.fused_ops.empty()) { + kernel_selector::Datatype input_dt = GetAccumulatorType(params); + std::vector idx_order; + if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 4) { + idx_order = {"b", "f_block * " + toCodeString(vec_size), "y", "x"}; + } else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 5) { + idx_order = {"b", "f_block * " + toCodeString(vec_size), 
"z", "y", "x"}; + } + + FusedOpsConfiguration conf = {"", idx_order, "res", input_dt, (size_t)vec_size}; + + conf.vec_axis = Tensor::DataChannelName::FEATURE; + jit.Merge(MakeFusedOpsJitConstants(params, {conf})); + } + + jit.AddConstant(MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization)); + jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size)); + jit.AddConstant(MakeJitConstant("VSTORE_N", "vstore" + toCodeString(vec_size))); + + if (params.broadcast) { + bool need_idx_safe = true; + for (size_t i = 0; i < params.inputs.size(); i++) { + if (params.inputs[i].LogicalSize() == 1) { + need_idx_safe = false; + break; + } + } + + if (need_idx_safe) + jit.AddConstant(MakeJitConstant("ELTWISE_BROADCAST", params.broadcast)); + } + + return jit; +} + +EltwiseKernelBase::DispatchData EltwiseKernel_blocked_opt::SetDefault(const eltwise_params& params) const { + DispatchData dispatchData; + auto in_layout = params.inputs[0].GetLayout(); + auto out_layout = params.outputs[0].GetLayout(); + std::vector> dims_by_gws = {{Tensor::DataChannelName::FEATURE}, + {Tensor::DataChannelName::X, Tensor::DataChannelName::Y}, + {Tensor::DataChannelName::BATCH}}; + // Global workgroup size 0: feature, 1: spatial, 2: batch + dispatchData.gws[0] = CeilDiv(params.outputs[0].Feature().v, GetFeatureBlockSizeFromFormat(params, 0)); + dispatchData.gws[2] = params.outputs[0].Batch().v; + if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 5) + dispatchData.gws[1] = params.outputs[0].X().v * params.outputs[0].Y().v * params.outputs[0].Z().v; + else if (DataTensor::ChannelsCount(params.outputs[0].GetLayout()) == 4) + dispatchData.gws[1] = params.outputs[0].X().v * params.outputs[0].Y().v; + else + IE_ASSERT("Unexpected dimension for eltwise_blocked_opt kernel."); + + // Calculate local workgroup size + dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo, in_layout, out_layout, dims_by_gws); + if (out_layout == DataLayout::b_fs_yx_fsv4) { + dispatchData.lws[0] = 1; + dispatchData.lws[2] = 1; + } + + return dispatchData; +} + +// Local +static inline int GetFeatureBlockSizeFromFormat(const eltwise_params& arg, size_t index) { + auto in_layout = arg.inputs[index].GetLayout(); + switch (in_layout) { + case DataLayout::b_fs_yx_fsv4: + return 4; + case DataLayout::b_fs_yx_fsv16: + case DataLayout::b_fs_yx_fsv32: + case DataLayout::b_fs_zyx_fsv16: + case DataLayout::b_fs_zyx_fsv32: + case DataLayout::bs_fs_yx_bsv32_fsv32: + case DataLayout::bs_fs_yx_bsv32_fsv16: + case DataLayout::bs_fs_yx_bsv16_fsv32: + case DataLayout::bs_fs_yx_bsv16_fsv16: + case DataLayout::bs_fs_zyx_bsv32_fsv32: + case DataLayout::bs_fs_zyx_bsv32_fsv16: + case DataLayout::bs_fs_zyx_bsv16_fsv32: + case DataLayout::bs_fs_zyx_bsv16_fsv16: + return 8; + default: + return 1; + } +} + +static inline bool IsBroadcastingPossibleInput(const DataTensor& input, const DataTensor& output) { + if ((input.LogicalSize() == 1) || + (input.LogicalSize() == output.Feature().v && input.Feature().v == output.Feature().v)) { + return true; + } + return false; +} + +static inline bool InputHasFeatureBroadcast(const eltwise_params& params, const size_t op_num, const size_t input_idx) { + const auto &ew = params.operations[op_num]; + + const auto &input = ew.inputs[input_idx]; + if (input.mode == EltwiseInputMode::INPUT_BUFFER) { + if (params.inputs[input_idx].LogicalSize() != 1 + && params.inputs[input_idx].Feature().v == 1 + && params.outputs[0].Feature().v != 1) { + return true; + } + } + + return false; +} +} // namespace 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.h
new file mode 100644
index 00000000000..7b7ab381e91
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.h
@@ -0,0 +1,43 @@
+// Copyright (C) 2018-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "eltwise_kernel_base.h"
+
+namespace kernel_selector {
+class EltwiseKernel_blocked_opt : public EltwiseKernelBase {
+public:
+    EltwiseKernel_blocked_opt() : EltwiseKernelBase("eltwise_blocked_opt") {}
+    virtual ~EltwiseKernel_blocked_opt() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    KernelsPriority GetKernelsPriority(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return {
+            FusedOpType::QUANTIZE,
+            FusedOpType::ACTIVATION,
+            FusedOpType::ELTWISE
+        };
+    }
+
+protected:
+    bool Validate(const Params& p, const optional_params& o) const override;
+    JitConstants MakeLoadJitConstants(const eltwise_params& params, bool useVload8) const override;
+    JitConstants GetJitConstants(const eltwise_params& params) const override;
+    DispatchData SetDefault(const eltwise_params& params) const override;
+    void PrintWorkSize(const DispatchData& dis);
+};
+} // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_selector.cpp
index f77b1b51b98..88afcf24504 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_selector.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_selector.cpp
@@ -9,6 +9,8 @@
 #include "eltwise_kernel_b_fs_yx_fsv16.h"
 #include "eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.h"
 #include "eltwise_kernel_b_fs_yx_fsv4.h"
+#include "eltwise_kernel_blocked_opt.h"
+
 
 namespace kernel_selector {
 eltwise_kernel_selector::eltwise_kernel_selector() {
@@ -17,6 +19,7 @@ eltwise_kernel_selector::eltwise_kernel_selector() {
     Attach();
     Attach();
     Attach();
+    Attach<EltwiseKernel_blocked_opt>();
     Attach();
 }
diff --git a/src/plugins/intel_gpu/tests/test_cases/eltwise_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/eltwise_gpu_test.cpp
index b7b9ab888fa..9f47a235497 100644
--- a/src/plugins/intel_gpu/tests/test_cases/eltwise_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/test_cases/eltwise_gpu_test.cpp
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "intel_gpu/runtime/layout.hpp"
 #include "test_utils.h"
 
 #include
@@ -3932,6 +3933,14 @@ struct eltwise_layout_test_params {
 #define CASE_ELTWISE_TEST9 eltwise_mode::eq, {4, 2, 4, 4}, {1, 1, 1, 1}, format::b_fs_yx_fsv16, format::bfyx, "generic_eltwise_ref"
 
 class eltwise_layout_test : public BaseEltwiseTest<eltwise_layout_test_params> {
+public:
+    std::string PrintToString(const eltwise_layout_test_params& params) {
+        std::string res;
+        res += " format1 (" + format::traits(params.input0_format).str + ")";
+        res += " format2 (" + format::traits(params.input1_format).str + ")";
+
+        return res;
+    }
 };
 
 class eltwise_test_mixed_layout : public eltwise_layout_test {};
@@ -4027,6 +4036,15 @@ struct eltwise_random_test_params {
 
 struct eltwise_random_test : testing::TestWithParam<eltwise_random_test_params>
 {
+    static std::string PrintToString(const eltwise_random_test_params& params) {
+        std::string res = " data (" + cldnn::data_type_traits::name(params.input_type) + "), ";
+        res += " format (" + format::traits(params.in_format).str + ") input1 : ";
+        res += params.first_input_size.to_string() + " / input2 : ";
+        res += params.second_input_size.to_string() + "\n";
+
+        return res;
+    }
+
     template <typename T>
     void fill_random_typed(memory::ptr mem, int min, int max, int k) {
         auto l = mem->get_layout();
@@ -4188,15 +4206,17 @@ struct eltwise_random_test_param_generator : std::vector<eltwise_random_test_params>