From 36df508baf628acf09d2e34e7862332571295d8b Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Thu, 2 Feb 2023 10:08:36 +0400 Subject: [PATCH] [GPU] Added shape agnostic ref kernels for Select and Activation (#15016) * [GPU] Added Select shape agnostic support * [GPU] Added Activation shape agnostic support --- .../prepare_primitive_fusing.cpp | 1 - .../src/graph/impls/ocl/activation.cpp | 27 ++++- .../intel_gpu/src/graph/impls/ocl/pooling.cpp | 2 +- .../intel_gpu/src/graph/impls/ocl/select.cpp | 85 +++++++++----- .../src/graph/include/primitive_inst.h | 2 +- .../intel_gpu/src/graph/include/select_inst.h | 3 + src/plugins/intel_gpu/src/graph/select.cpp | 17 +++ .../cl_kernels/activation_ref.cl | 11 +- .../cl_kernels/select_gpu_ref.cl | 42 ++++--- .../activation/activation_kernel_base.cpp | 20 +++- .../activation/activation_kernel_ref.cpp | 1 + .../kernels/select/select_kernel_base.cpp | 20 +++- .../kernels/select/select_kernel_ref.cpp | 1 + .../intel_gpu/src/plugin/ops/convert.cpp | 4 +- .../fusions/fully_connected_fusion_test.cpp | 2 +- .../test_cases/activation_simple_gpu_test.cpp | 78 ++++++++++++- .../tests/test_cases/select_gpu_test.cpp | 104 +++++++++++++++++- 17 files changed, 351 insertions(+), 69 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index a6ea3f1ed94..2022042c4ac 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -911,7 +911,6 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { return; std::vector> parents = node.get_dependencies(); - std::list users = node.get_users(); std::vector can_fuse_parents = { false, false }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp index 61c4589a7e1..96b6e3798d1 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/activation.cpp @@ -64,12 +64,37 @@ struct activation_impl : typed_primitive_impl_ocl { return {params, optional_params}; } + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param); + (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data); + } }; namespace detail { attach_activation_impl::attach_activation_impl() { - implementation_map::add(impl_types::ocl, typed_primitive_impl_ocl::create, { + auto dyn_types = { + data_types::f32, + data_types::f16, + data_types::i8, + data_types::u8, + data_types::i32 + }; + + auto dyn_formats = { + format::bfyx, + format::bfzyx, + format::bfwzyx + }; + + implementation_map::add(impl_types::ocl, + shape_types::dynamic_shape, + typed_primitive_impl_ocl::create, + dyn_types, + dyn_formats); + + implementation_map::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl::create, { std::make_tuple(data_types::f32, format::yxfb), std::make_tuple(data_types::f16, format::yxfb), std::make_tuple(data_types::f32, format::bfyx), diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp index 180427764ae..28b06faf37f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/pooling.cpp @@ -168,7 +168,7 @@ public: namespace detail { attach_pooling_impl::attach_pooling_impl() { - std::set::key_type> keys; + std::set::key_type> keys; auto types = { data_types::f16, data_types::f32, data_types::i8, data_types::u8 }; auto formats = { format::bfyx, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp index 85d8771d928..c66f8e5d78c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/select.cpp @@ -30,15 +30,20 @@ struct select_impl : typed_primitive_impl_ocl { return true; }; - for (size_t i = 0; i < layouts.size(); i++) { - auto shape = layouts[i].get_shape(); - auto shape_size = shape.size(); - if (shape_size < 4 && !broadcastable(o_layout, layouts[i])) { - shape.insert(shape.begin(), 4 - shape_size, 1); - layout new_layout = layouts[i]; - new_layout.set_partial_shape(shape); - layouts[i] = new_layout; + for (auto& l : input_layouts) { + auto pshape = l.get_partial_shape(); + auto rank = pshape.size(); + + if (rank < 4 && !broadcastable(o_layout, l)) { + pshape.insert(pshape.begin(), 4 - rank, 1); + layout new_layout = l; + new_layout.set_partial_shape(pshape); + l = new_layout; } } - for (size_t i = 1; i < layouts.size(); i++) { - params.inputs.push_back(convert_data_tensor(layouts[i])); + for (size_t i = 1; i < input_layouts.size(); ++i) { + params.inputs.push_back(convert_data_tensor(input_layouts[i])); } return {params, optional_params}; } + + void update_dispatch_data(const kernel_impl_params& impl_param) override { + auto kernel_params = get_kernel_params(impl_param); + (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data); + } }; namespace detail { attach_select_impl::attach_select_impl() { - implementation_map::create, { - std::make_tuple(data_types::f32, format::yxfb), - std::make_tuple(data_types::f16, format::yxfb), - std::make_tuple(data_types::i8, format::yxfb), - std::make_tuple(data_types::u8, format::yxfb), - std::make_tuple(data_types::f32, format::bfyx), - std::make_tuple(data_types::f16, format::bfyx), - std::make_tuple(data_types::i8, format::bfyx), - std::make_tuple(data_types::u8, format::bfyx), - std::make_tuple(data_types::f32, format::byxf), - std::make_tuple(data_types::f16, format::byxf), - std::make_tuple(data_types::i8, format::byxf), - std::make_tuple(data_types::u8, format::byxf), - }); + auto types = { + data_types::f32, + data_types::f16, + data_types::i8, + data_types::u8 + }; + + auto static_formats = { + format::bfyx, + format::byxf, + format::yxfb, + }; + + implementation_map::create, + types, + static_formats); + + auto dyn_formats = { + format::bfyx + }; + + implementation_map::create, + types, + dyn_formats); } } // namespace detail diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index bd0b3e782f4..639430ae7c1 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -413,7 +413,7 @@ public: } static std::vector extend_input_shape_to_6d(kernel_impl_params const& orig_impl_param, int32_t input_idx) { - ov::PartialShape ps = orig_impl_param.get_input_layout(input_idx).get_partial_shape(); + ov::PartialShape ps = orig_impl_param.get_input_layout(input_idx).get_partial_shape(); if (ps.size() < 4) { ps.insert(ps.end(), 4 - ps.size(), ov::Dimension(1)); diff --git a/src/plugins/intel_gpu/src/graph/include/select_inst.h b/src/plugins/intel_gpu/src/graph/include/select_inst.h index ce6b9d67109..a30cb33e80a 100644 --- a/src/plugins/intel_gpu/src/graph/include/select_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/select_inst.h @@ -32,6 +32,9 @@ public: template static std::vector calc_output_layouts(const select_node& /*node*/, const kernel_impl_params& impl_param); static layout calc_output_layout(select_node const& node, kernel_impl_params const& impl_param); + static std::vector extend_shape_to_6d(ov::PartialShape ps); + static std::vector extend_input_shape_to_6d(kernel_impl_params const& orig_impl_param, int32_t input_idx); + static std::vector extend_output_shape_to_6d(kernel_impl_params const& orig_impl_param, int32_t output_idx); static std::string to_string(select_node const& node); typed_primitive_inst(network& network, select_node const& node); }; diff --git a/src/plugins/intel_gpu/src/graph/select.cpp b/src/plugins/intel_gpu/src/graph/select.cpp index 98e6b86be64..f701fdee2f8 100644 --- a/src/plugins/intel_gpu/src/graph/select.cpp +++ b/src/plugins/intel_gpu/src/graph/select.cpp @@ -55,6 +55,23 @@ std::vector select_inst::calc_output_layouts(const select_node& /*node*/ return {{output_shapes[0], dt, format::get_default_format(output_shapes[0].size())}}; } +std::vector select_inst::extend_shape_to_6d(ov::PartialShape ps) { + if (ps.size() < 4) { + ps.insert(ps.begin(), 4 - ps.size(), ov::Dimension(1)); + } + + layout l(ps, data_types::i32, format::get_default_format(ps.size())); + return l.transform(format::bfwzyx).to_shape(); +} + +std::vector select_inst::extend_input_shape_to_6d(kernel_impl_params const& orig_impl_param, int32_t input_idx) { + return extend_shape_to_6d(orig_impl_param.get_input_layout(input_idx).get_partial_shape()); +} + +std::vector select_inst::extend_output_shape_to_6d(kernel_impl_params const& orig_impl_param, int32_t output_idx) { + return extend_shape_to_6d(orig_impl_param.get_output_layout(output_idx).get_partial_shape()); +} + std::string select_inst::to_string(select_node const& node) { auto node_info = node.desc_to_json(); auto desc = node.get_primitive(); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl index 7282add44f2..6e646e5a49a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/activation_ref.cl @@ -12,6 +12,7 @@ // TODO: move it from layout based to memory based KERNEL(activation)( + OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output #if HAS_FUSED_OPS_DECLS @@ -32,7 +33,7 @@ KERNEL(activation)( const unsigned x = get_global_id(0); const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y; const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y; - #if OUTPUT_BATCH_NUM == 1 + #if OUTPUT_BATCH_NUM_CONST == 1 const unsigned feature = (uint)get_global_id(2); const unsigned batch = 0; #else @@ -44,7 +45,7 @@ KERNEL(activation)( const unsigned x = (uint)get_global_id(1); const unsigned y = (uint)get_global_id(2); #define z 0 - #if OUTPUT_BATCH_NUM == 1 + #if OUTPUT_BATCH_NUM_CONST == 1 const unsigned feature = (uint)get_global_id(0); const unsigned batch = 0; #else @@ -60,7 +61,7 @@ KERNEL(activation)( #define z 0 const unsigned x = (uint)get_global_id(0); const unsigned y = (uint)get_global_id(1); - #if OUTPUT_BATCH_NUM == 1 + #if OUTPUT_BATCH_NUM_CONST == 1 const unsigned feature = (uint)get_global_id(2); const unsigned batch = 0; #else @@ -72,11 +73,11 @@ KERNEL(activation)( // GWS.feature and GWS.batch is aligned to 16. Otherwise, there are some idling WIs. #if (defined(OUTPUT_LAYOUT_B_FS_YX_FSV16) || defined(OUTPUT_LAYOUT_B_FS_YX_FSV32)) \ - && OUTPUT_FEATURE_NUM % 16 != 0 + && (OUTPUT_FEATURE_NUM_CONST % 16 != 0 || IS_DYNAMIC) if (feature >= OUTPUT_FEATURE_NUM) return; #elif (defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV16) || defined(OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32)) \ - && (OUTPUT_FEATURE_NUM % 16 != 0 || OUTPUT_BATCH_NUM % 16 != 0) + && (OUTPUT_FEATURE_NUM_CONST % 16 != 0 || OUTPUT_BATCH_NUM_CONST % 16 != 0 || IS_DYNAMIC) if (batch >= OUTPUT_BATCH_NUM || feature >= OUTPUT_FEATURE_NUM) return; #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl index e63c0419e92..815f9d961fe 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl @@ -4,18 +4,23 @@ #include "include/batch_headers/fetch_data.cl" -#define GET_INDEX(prefix, num) \ - CAT(CAT(prefix, num), _OFFSET) + \ - (d1 % CAT(CAT(prefix, num), _SIZES)[0])*CAT(CAT(prefix, num), _PITCHES)[0] + \ - (d2 % CAT(CAT(prefix, num), _SIZES)[1])*CAT(CAT(prefix, num), _PITCHES)[1] + \ - (d3 % CAT(CAT(prefix, num), _SIZES)[2])*CAT(CAT(prefix, num), _PITCHES)[2] + \ - (d4 % CAT(CAT(prefix, num), _SIZES)[3])*CAT(CAT(prefix, num), _PITCHES)[3] +#ifdef IS_DYNAMIC + #define GET_INDEX(prefix) GET_DATA_INDEX_SAFE(prefix, d4, d3, d2, d1) +#else + #define GET_INDEX(prefix) \ + CAT(prefix, _OFFSET) + \ + (d1 % CAT(prefix, _SIZES)[0])*CAT(prefix, _PITCHES)[0] + \ + (d2 % CAT(prefix, _SIZES)[1])*CAT(prefix, _PITCHES)[1] + \ + (d3 % CAT(prefix, _SIZES)[2])*CAT(prefix, _PITCHES)[2] + \ + (d4 % CAT(prefix, _SIZES)[3])*CAT(prefix, _PITCHES)[3] +#endif -#define INPUT_0 input0[GET_INDEX(INPUT, 0)] -#define INPUT_1 input1[GET_INDEX(INPUT, 1)] -#define INPUT_2 input2[GET_INDEX(INPUT, 2)] +#define INPUT_0 input0[GET_INDEX(INPUT0)] +#define INPUT_1 input1[GET_INDEX(INPUT1)] +#define INPUT_2 input2[GET_INDEX(INPUT2)] KERNEL(select)( + OPTIONAL_SHAPE_INFO_ARG INPUTS_DECLS __global OUTPUT_TYPE* output) { @@ -24,14 +29,19 @@ const uint d1 = (uint) get_global_id(0); const uint d2 = (uint) get_global_id(1); const uint d34 = (uint) get_global_id(2); -const uint d3 = d34 % OUTPUT_SIZES[2]; -const uint d4 = d34 / OUTPUT_SIZES[2]; +#ifdef IS_DYNAMIC + const uint d3 = d34 % OUTPUT_FEATURE_NUM; + const uint d4 = d34 / OUTPUT_FEATURE_NUM; +#else + const uint d3 = d34 % OUTPUT_SIZES[2]; + const uint d4 = d34 / OUTPUT_SIZES[2]; +#endif -uint output_offset = OUTPUT_OFFSET + - d1*OUTPUT_PITCHES[0] + - d2*OUTPUT_PITCHES[1] + - d3*OUTPUT_PITCHES[2] + - d4*OUTPUT_PITCHES[3]; +#ifdef IS_DYNAMIC + uint output_offset = OUTPUT_GET_INDEX(d4, d3, d2, d1); +#else + uint output_offset = GET_DATA_INDEX_RAW(OUTPUT, d1, d2, d3, d4); +#endif const OUTPUT_TYPE res = select(INPUT_2, INPUT_1, MASK); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp index e219f24a29c..28a2fff66f1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_base.cpp @@ -54,6 +54,14 @@ JitConstants ActivationKernelBase::GetJitConstants(const activation_params& para }); } + if (params.has_dynamic_outputs()) { + jit.AddConstant(MakeJitConstant("OUTPUT_BATCH_NUM_CONST", 0)); + jit.AddConstant(MakeJitConstant("OUTPUT_FEATURE_NUM_CONST", 0)); + } else { + jit.AddConstant(MakeJitConstant("OUTPUT_BATCH_NUM_CONST", params.outputs[0].Batch().v)); + jit.AddConstant(MakeJitConstant("OUTPUT_FEATURE_NUM_CONST", params.outputs[0].Feature().v)); + } + return jit; } @@ -78,7 +86,6 @@ KernelsData ActivationKernelBase::GetCommonKernelsData(const Params& params, con } KernelData kd = KernelData::Default(params); - activation_params& newParams = *static_cast(kd.params.get()); auto dispatchData = SetDefault(newParams); @@ -86,9 +93,18 @@ KernelsData ActivationKernelBase::GetCommonKernelsData(const Params& params, con auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, options); auto jit = CreateJit(kernelName, cldnn_jit, entry_point); + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + }; + auto& kernel = kd.kernels[0]; FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, - EXE_MODE_DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params)); + EXE_MODE_DEFAULT, false, false, 1, + GetFusedPrimitiveInputsCount(params), 1, newParams.outputs[0].is_dynamic()); if (!newParams.inputActivationParams.empty()) { kernel.params.arguments.push_back({ArgumentDescriptor::Types::SLOPE, 0}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_ref.cpp index 42ab01c87bf..8f5a6c8cf3a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/activation/activation_kernel_ref.cpp @@ -27,6 +27,7 @@ ParamsKey ActivationKernelRef::GetSupportedKey() const { k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); + k.EnableDynamicShapesSupport(); return k; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_base.cpp index 50edfd7f013..ee6f1e518c1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_base.cpp @@ -129,13 +129,21 @@ KernelsData SelectKernelBase::GetCommonKernelsData(const Params& params, const o DispatchData dispatchData = SetDefault(newParams); + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + }; + auto& kernel = kd.kernels[0]; - - kernel.params.workGroups.global = dispatchData.gws; - kernel.params.workGroups.local = dispatchData.lws; - - kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT); - kernel.params.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, false); + FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, + "", false, false, + (uint32_t)newParams.inputs.size(), + 0, + 1, + newParams.outputs[0].is_dynamic()); return {kd}; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.cpp index 89c4b8e1e5d..ca647dbcef8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.cpp @@ -36,6 +36,7 @@ ParamsKey SelectKernelRef::GetSupportedKey() const { k.EnableTensorPitches(); k.EnableTensorOffset(); k.EnableDifferentTypes(); + k.EnableDynamicShapesSupport(); return k; } diff --git a/src/plugins/intel_gpu/src/plugin/ops/convert.cpp b/src/plugins/intel_gpu/src/plugin/ops/convert.cpp index 0b3b4b77d70..5c1e1b26271 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/convert.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/convert.cpp @@ -23,9 +23,7 @@ static void CreateConvertLikeOp(Program& p, const std::shared_ptr(), - cldnn::reorder_mean_mode::subtract); + outDataType); p.add_primitive(*op, reorderPrim); } diff --git a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp index 6d068dc313f..e6becbacb9e 100644 --- a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp @@ -177,7 +177,7 @@ class fc_fp32_activation_dynamic : public FullyConnectedFusingTest {}; TEST_P(fc_fp32_activation_dynamic, basic) { auto p = GetParam(); auto test_input_layout = get_input_layout(p); - auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_rank()), test_input_layout.data_type, test_input_layout.format}; + auto dynamic_input_layout = layout{ov::PartialShape::dynamic(test_input_layout.get_partial_shape().size()), test_input_layout.data_type, test_input_layout.format}; create_topologies( input_layout("input", dynamic_input_layout), data("weights", get_mem(get_weights_layout(p))), diff --git a/src/plugins/intel_gpu/tests/test_cases/activation_simple_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/activation_simple_gpu_test.cpp index 418dd00bc21..b2452c75d12 100644 --- a/src/plugins/intel_gpu/tests/test_cases/activation_simple_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/activation_simple_gpu_test.cpp @@ -8,6 +8,7 @@ #include #include #include +#include "activation_inst.h" #include #include @@ -15,9 +16,84 @@ using namespace cldnn; using namespace ::tests; +TEST(activation_f32_fw_gpu, dynamic) { + auto& engine = get_test_engine(); + + ov::PartialShape in_shape = { 1, 1, 4, 2 }; + layout in_layout { ov::PartialShape::dynamic(in_shape.size()), data_types::f32, format::bfyx }; + + auto input = engine.allocate_memory({ in_shape, data_types::f32, format::bfyx }); + set_values(input, { -0.12f, 0.56f, 0.45f, -0.789f, 42.f, 0.999f, 0.7899f, 0.f}); + + std::vector funcs = { + activation_func::gelu, + activation_func::relu, + activation_func::hyperbolic_tan, + activation_func::sqrt + }; + + for (auto func : funcs) { + topology topology(input_layout("input", in_layout)); + topology.add(activation("activation", input_info("input"), func)); + + ExecutionConfig config; + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + network network(engine, topology, config); + + network.set_input_data("input", input); + + auto inst = network.get_primitive("activation"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + + auto outputs = network.execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "activation"); + + auto output_memory = outputs.at("activation").get_memory(); + auto output_layout = output_memory->get_layout(); + cldnn::mem_lock output_ptr(output_memory, get_test_stream()); + cldnn::mem_lock input_ptr(input, get_test_stream()); + + int y_size = output_layout.spatial(1); + int x_size = output_layout.spatial(0); + int f_size = output_layout.feature(); + int b_size = output_layout.batch(); + + ASSERT_EQ(output_layout.format, format::bfyx); + ASSERT_EQ(y_size, 4); + ASSERT_EQ(x_size, 2); + ASSERT_EQ(f_size, 1); + ASSERT_EQ(b_size, 1); + + for (size_t i = 0; i < output_layout.get_linear_size(); ++i) { + switch (func) { + case activation_func::gelu: + ASSERT_NEAR(0.5f * static_cast(input_ptr[i]) * (1.f + std::erf(static_cast((input_ptr[i])) / std::sqrt(2.0f))), + output_ptr[i], 1e-5f); + break; + case activation_func::relu: + ASSERT_EQ(std::max(input_ptr[i], static_cast(0)), output_ptr[i]); + break; + case activation_func::hyperbolic_tan: + ASSERT_FLOAT_EQ(std::tanh(static_cast(input_ptr[i])), output_ptr[i]); + break; + case activation_func::sqrt: + if (input_ptr[i] >= 0) { + ASSERT_FLOAT_EQ(std::sqrt(static_cast(input_ptr[i])), output_ptr[i]); + } + break; + default: + break; + } + } + } +} + TEST(activation_f32_fw_gpu, not_basic_yxfb) { // Input: - // 1 0 -3 4 5 + // 1 0 -3 4 5 // 0 2 3 4 -6 // 3 -3 3 0 1 // 1 1 1 -1 0 diff --git a/src/plugins/intel_gpu/tests/test_cases/select_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/select_gpu_test.cpp index b62ec30a474..14ce3e7a0fe 100644 --- a/src/plugins/intel_gpu/tests/test_cases/select_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/select_gpu_test.cpp @@ -5,7 +5,8 @@ #include "test_utils.h" #include -#include "intel_gpu/primitives/select.hpp" +#include +#include "select_inst.h" using namespace cldnn; using namespace ::tests; @@ -2292,3 +2293,104 @@ TEST(select_gpu_fp32, select_numpy_broadcast_mask_u8_1x1x3) { ASSERT_EQ(answers[i], output_ptr[i]); } } + +TEST(select_gpu_f32, dynamic) { + auto& engine = get_test_engine(); + + ov::PartialShape in1_shape = { 2, 2, 2, 2 }; + ov::PartialShape in2_shape = { 2, 2, 2, 2 }; + ov::PartialShape mask_shape = { 2, 2, 2, 1 }; + + layout input1_layout { ov::PartialShape::dynamic(in1_shape.size()), data_types::f32, format::bfyx }; + layout input2_layout { ov::PartialShape::dynamic(in2_shape.size()), data_types::f32, format::bfyx }; + layout mask_layout { ov::PartialShape::dynamic(mask_shape.size()), data_types::f32, format::bfyx }; + + auto input1 = engine.allocate_memory({ in1_shape, data_types::f32, format::bfyx }); + auto input2 = engine.allocate_memory({ in2_shape, data_types::f32, format::bfyx }); + auto mask = engine.allocate_memory({ mask_shape, data_types::f32, format::bfyx }); + + set_values(input1, { + 1.f, 0.f, + 5.f, 1.5f, + + 2.f, 0.f, + 6.f, 5.2f, + + 3.f, 0.5f, + 7.f, 12.f, + + 4.f, -0.5f, + 8.f, 8.f + }); + + set_values(input2, { + 0.5f, 2.5f, + 1.5f, 3.f, + + 5.f, 7.f, + 2.f, 4.f, + + 15.f, 17.f, + 8.f, 10.f, + + -2.f, 6.5f, + -0.5f, -2.5f + }); + + set_values(mask, { + 0.f, + 0.f, + + 1.f, + 1.f, + + 0.f, + 1.f, + + 1.f, + 0.f, + }); + + topology topology; + topology.add(input_layout("input1", input1_layout)); + topology.add(input_layout("input2", input2_layout)); + topology.add(input_layout("mask", mask_layout)); + topology.add(cldnn::select("select", input_info("mask"), input_info("input1"), input_info("input2"))); + + ExecutionConfig config; + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + network network(engine, topology, config); + + network.set_input_data("input1", input1); + network.set_input_data("input2", input2); + network.set_input_data("mask", mask); + + auto inst = network.get_primitive("select"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + + auto outputs = network.execute(); + + auto output = outputs.at("select").get_memory(); + + float answers[16] = { + 0.5f, 2.5f, + 1.5f, 3.f, + + 2.f, 0.f, + 6.f, 5.2f, + + 15.f, 17.f, + 7.f, 12.f, + + 4.f, -0.5f, + -0.5f, -2.5f + }; + + cldnn::mem_lock output_ptr(output, get_test_stream()); + + for (int i = 0; i < 16; i++) { + ASSERT_TRUE(are_equal(answers[i], output_ptr[i])); + } +}