diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp
index 2830d4066e1..0a14ff8c472 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp
@@ -24,6 +24,7 @@ ParamsKey ActivationKernelOpt::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::INT32);
     k.EnableInputDataType(Datatype::F16);
     k.EnableInputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
     k.EnableOutputDataType(Datatype::INT8);
     k.EnableOutputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::F16);
@@ -81,8 +82,7 @@ bool ActivationKernelOpt::Validate(const Params& p, const optional_params& o) co
         return false;

     if (!params.fused_ops.empty() &&
-        ((params.output.GetLayout() != DataLayout::bfyx && params.output.GetLayout() != DataLayout::bfzyx) ||
-         ((params.output.X().v * params.output.Y().v) % 4 != 0)))
+        (params.output.GetLayout() != DataLayout::bfyx && params.output.GetLayout() != DataLayout::bfzyx))
         return false;

     return true;
@@ -94,15 +94,56 @@ JitConstants ActivationKernelOpt::GetJitConstants(const activation_params& param
     jit.AddConstant(MakeJitConstant("NUM_COLS_WI", NUM_COLS_WI));

     if (!params.fused_ops.empty()) {
+        bool can_use_vector = params.inputs[0].X().v % 4 == 0;
+        jit.AddConstant(MakeJitConstant("CAN_USE_VECTOR", can_use_vector));
+
         std::vector<std::string> idx_order;
-        if (params.inputs[0].GetDims().size() <= 4) {
-            idx_order = {"fo_b", "fo_f", "fo_y", "fo_x"};
-        } else if (params.inputs[0].GetDims().size() == 5) {
-            idx_order = {"fo_b", "fo_f", "fo_z", "fo_y", "fo_x"};
+
+        if (can_use_vector) {
+            if (params.inputs[0].GetDims().size() <= 4) {
+                idx_order = {"x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_FEATURE_NUM)",
+                             "x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % OUTPUT_FEATURE_NUM",
+                             "x / OUTPUT_SIZE_X % OUTPUT_SIZE_Y",
+                             "x % OUTPUT_SIZE_X"};
+            } else if (params.inputs[0].GetDims().size() == 5) {
+                idx_order = {"x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_FEATURE_NUM)",
+                             "x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z) % OUTPUT_FEATURE_NUM",
+                             "x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z",
+                             "x / OUTPUT_SIZE_X % OUTPUT_SIZE_Y",
+                             "x % OUTPUT_SIZE_X"};
+            }
+        } else {
+            if (params.inputs[0].GetDims().size() <= 4) {
+                idx_order = {"(x + i) / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_FEATURE_NUM)",
+                             "(x + i) / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % OUTPUT_FEATURE_NUM",
+                             "(x + i) / OUTPUT_SIZE_X % OUTPUT_SIZE_Y",
+                             "(x + i) % OUTPUT_SIZE_X"};
+            } else if (params.inputs[0].GetDims().size() == 5) {
+                idx_order = {"(x + i) / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z * OUTPUT_FEATURE_NUM)",
+                             "(x + i) / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z) % OUTPUT_FEATURE_NUM",
+                             "(x + i) / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z",
+                             "(x + i) / OUTPUT_SIZE_X % OUTPUT_SIZE_Y",
+                             "(x + i) % OUTPUT_SIZE_X"};
+            }
         }
-        FusedOpsConfiguration conf =
-            {"", idx_order, "v", input_dt, 4, LoadType::LT_UNALIGNED, BoundaryCheck::DISABLED, IndexType::TENSOR_COORD};
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+        FusedOpsConfiguration conf_vector = {"_VECTOR",
+                                             idx_order,
+                                             "v",
+                                             input_dt,
+                                             4,
+                                             LoadType::LT_UNALIGNED,
+                                             BoundaryCheck::DISABLED,
+                                             IndexType::TENSOR_COORD,
+                                             Tensor::DataChannelName::X};
{"_SCALAR", + idx_order, + "v[i]", + input_dt, + 1, + LoadType::LT_UNALIGNED, + BoundaryCheck::DISABLED, + IndexType::TENSOR_COORD}; + jit.Merge(MakeFusedOpsJitConstants(params, {conf_vector, conf_scalar})); } jit.Merge(MakeActivationJitConstants(params.activations, input_dt, "_KERNEL")); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h index e2db8123f26..51545bed5fa 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.h @@ -35,7 +35,9 @@ protected: bool Validate(const Params& p, const optional_params& o) const override; JitConstants GetJitConstants(const activation_params& params, DispatchData kd) const override; std::vector GetSupportedFusedOps() const override { - return { FusedOpType::QUANTIZE }; + return {FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION}; } }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h index ad15f0a5fae..e8e170be277 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.h @@ -29,7 +29,9 @@ public: ParamsKey GetSupportedKey() const override; JitConstants GetJitConstants(const activation_params& params, DispatchData kd) const override; std::vector GetSupportedFusedOps() const override { - return { FusedOpType::QUANTIZE }; + return {FusedOpType::QUANTIZE, + FusedOpType::SCALE, + FusedOpType::ACTIVATION}; } }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_opt.cl index 6e34d064956..7dbfc6f8797 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/activation_opt.cl @@ -18,7 +18,7 @@ #include "include/data_types.cl" KERNEL(activation)( - __global INPUT0_TYPE* input, + __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output #if HAS_FUSED_OPS_DECLS , FUSED_OPS_DECLS @@ -26,21 +26,8 @@ KERNEL(activation)( ) { const unsigned int x = (uint)get_global_id(0) * NUM_COLS_WI; -#if OUTPUT_DIMS == 5 - const unsigned int fo_x = x % OUTPUT_SIZE_X; - const unsigned int fo_y = x / OUTPUT_SIZE_X % OUTPUT_SIZE_Y; - const unsigned int fo_z = x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % OUTPUT_SIZE_Z; - const unsigned int fo_f = x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z) % OUTPUT_FEATURE_NUM; - const unsigned int fo_b = x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_Z* OUTPUT_FEATURE_NUM); -#elif OUTPUT_DIMS == 4 - const unsigned int fo_x = x % OUTPUT_SIZE_X; - const unsigned int fo_y = x / OUTPUT_SIZE_X % OUTPUT_SIZE_Y; - const unsigned int fo_f = x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % OUTPUT_FEATURE_NUM; - const unsigned int fo_b = x / (OUTPUT_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_FEATURE_NUM); -#endif - - unsigned int input_offset = x + INPUT0_OFFSET; - unsigned int output_offset 
-    unsigned int output_offset = x + OUTPUT_OFFSET;
+    unsigned int input_offset = x + INPUT0_OFFSET;
+    unsigned int output_offset = x + OUTPUT_OFFSET;

     typedef CAT(INPUT0_TYPE, 4) input_t;
     typedef CAT(OUTPUT_TYPE, 4) output_t;
@@ -48,9 +35,19 @@ KERNEL(activation)(
     input_t v = ((__global input_t*) (input + input_offset))[0];

     v = ACTIVATION_KERNEL(v, ACTIVATION_PARAMS_KERNEL);
+
 #if HAS_FUSED_OPS
-    FUSED_OPS;
-    *((__global output_t*)(output + output_offset)) = FUSED_OPS_RESULT;
+    output_t result;
+    #if !CAN_USE_VECTOR
+    for (int i = 0; i < 4; i++) {
+        FUSED_OPS_SCALAR;
+        result[i] = FUSED_OPS_RESULT_SCALAR;
+    }
+    #else
+    FUSED_OPS_VECTOR;
+    result = FUSED_OPS_RESULT_VECTOR;
+    #endif
+    *((__global output_t*)(output + output_offset)) = result;
 #else
     *((__global output_t*)(output + output_offset)) = v;
 #endif
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
index f8125664731..66cba243eff 100644
--- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
@@ -180,8 +180,10 @@ void prepare_primitive_fusing::fuse_activations(program_impl &p) {
         // - primitives input cannot be output
         // - no activation additional input
         // - input was optimized
+        // - can't have fused primitives
         if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.is_output() ||
-            node.get_dependencies().size() != 1 || input.can_be_optimized() || node.is_constant())
+            node.get_dependencies().size() != 1 || input.can_be_optimized() || node.is_constant() ||
+            node.has_fused_primitives())
             return;

         // - limit to primitives which implementations support activation fusing
@@ -353,6 +355,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
             should_fuse |= input_data.is_type();

+            should_fuse |= input_data.is_type<activation>();
+
             if (!should_fuse)
                 return;
@@ -390,6 +394,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
             should_fuse |= input_data.is_type();

+            should_fuse |= input_data.is_type<activation>();
+
             if (!should_fuse)
                 return;
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
index c2af2edfbb2..a3bbae5806b 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
@@ -2932,7 +2932,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu,
         lrn_test_params{CASE_LRN_FP16_5, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features"},
     }), );

-
 /* ----------------------------------------------------------------------------------------------------- */
 /* -------------------------------- Activation cases --------------------------------------------------- */
 /* ----------------------------------------------------------------------------------------------------- */
@@ -2940,43 +2939,62 @@ struct activation_test_params {
     tensor input_size;
     data_types input_type;
     format input_format;
-    activation_func activation_function;
-    activation_additional_params additional_params;
     data_types default_type;
     format default_format;
     size_t expected_fused_primitives;
     size_t expected_not_fused_primitives;
+    std::string kernel_name;
 };

-#define CASE_ACTIVATION_F32_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, activation_func::hyperbolic_tan, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F32_2 {1, 16, 8, 8}, data_types::f32, format::bfyx, activation_func::hard_sigmoid, {1.f, 1.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F32_3 {1, 16, 8, 8}, data_types::f32, format::bfyx, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F32_4 {1, 16, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, activation_func::hyperbolic_tan, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F32_5 {1, 16, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, activation_func::hard_sigmoid, {1.f, 1.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F32_6 {1, 16, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_3D_F32_1 {1, 16, 8, 8, 8}, data_types::f32, format::bfzyx, activation_func::hyperbolic_tan, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_3D_F32_2 {1, 16, 8, 8, 8}, data_types::f32, format::bfzyx, activation_func::hard_sigmoid, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_3D_F32_3 {1, 16, 8, 8, 8}, data_types::f32, format::bfzyx, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, activation_func::hyperbolic_tan, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_2 {1, 16, 8, 8}, data_types::f16, format::bfyx, activation_func::hard_sigmoid, {1.f, 1.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_3 {1, 16, 8, 8}, data_types::f16, format::bfyx, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_4 {1, 16, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, activation_func::hyperbolic_tan, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_5 {1, 16, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, activation_func::hard_sigmoid, {1.f, 1.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_F16_6 {1, 16, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_3D_F16_1 {1, 16, 8, 8, 8}, data_types::f16, format::bfzyx, activation_func::hyperbolic_tan, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_3D_F16_2 {1, 16, 8, 8, 8}, data_types::f16, format::bfzyx, activation_func::hard_sigmoid, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_3D_F16_3 {1, 16, 8, 8, 8}, data_types::f16, format::bfzyx, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_U8_2 {1, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
-#define CASE_ACTIVATION_3D_U8_1 {1, 16, 8, 8, 8}, data_types::u8, format::bfzyx, activation_func::relu, {0.f, 0.f}, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_0 {7, 32, 3, 3}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_2 {7, 3, 7, 7}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_3 {1, 14, 8, 8}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_4 {1, 17, 31, 29}, data_types::f32, format::yxfb, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_5 {1, 17, 31, 29}, data_types::f32, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_6 {1, 17, 31, 29}, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F32_7 {1, 17, 31, 29}, data_types::f32, format::fyxb, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_3D_F32_0 {3, 16, 13, 13, 13}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F32_1 {2, 16, 8, 8, 8}, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F32_2 {1, 16, 7, 7, 7}, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F32_3 {1, 17, 7, 7, 7}, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F32_4 {1, 17, 7, 7, 7}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F32_5 {1, 17, 7, 7, 7}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F32_6 {1, 17, 7, 7, 7}, data_types::f32, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
+
+#define CASE_ACTIVATION_F16_0 {7, 32, 5, 5}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_2 {7, 16, 7, 7}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_3 {1, 14, 8, 8}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_4 {1, 17, 31, 29}, data_types::f16, format::yxfb, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_5 {1, 17, 31, 29}, data_types::f16, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_6 {1, 17, 31, 29}, data_types::f16, format::b_fs_yx_fsv32, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_F16_7 {1, 17, 31, 29}, data_types::f16, format::fyxb, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_3D_F16_0 {3, 16, 13, 13, 13}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F16_1 {2, 16, 8, 8, 8}, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F16_2 {1, 16, 7, 7, 7}, data_types::f16, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F16_3 {1, 17, 7, 7, 7}, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F16_4 {1, 17, 7, 7, 7}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F16_5 {1, 17, 7, 7, 7}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx
+#define CASE_ACTIVATION_3D_F16_6 {1, 17, 7, 7, 7}, data_types::f16, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfzyx
+
+#define CASE_ACTIVATION_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_U8_2 {1, 12, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_I8_2 {1, 14, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_ACTIVATION_3D_I8_1 {1, 17, 8, 8, 8}, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx

 class ActivationFusingTest : public ::BaseFusingTest<activation_test_params> {
 public:
     void execute(activation_test_params& p) {
         auto input_prim = get_mem(get_input_layout(p));
+        build_options options;
+        implementation_desc activation_impl = {p.input_format, p.kernel_name};
+        options.set_option(build_option::optimize_data(true));
+        options.set_option(build_option::force_implementations({{"act", activation_impl}}));
+        network network_fused(this->engine, this->topology_fused, options);
         network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
-        network network_fused(this->engine, this->topology_fused, bo_fused);

         network_fused.set_input_data("input", input_prim);
         network_not_fused.set_input_data("input", input_prim);
@@ -2989,8 +3007,7 @@ public:
     layout get_per_channel_layout(activation_test_params& p) {
         return layout{p.default_type, p.default_format, tensor{1, p.input_size.feature[0], 1, 1}};
     }
-    activation_func get_activation_function(activation_test_params& p) { return p.activation_function; }
-    activation_additional_params get_activation_additional_params(activation_test_params& p) { return p.additional_params; }
+    format get_input_format(activation_test_params &p) { return p.input_format; }
 };

@@ -2998,68 +3015,97 @@ class activation_quantize_i8 : public ActivationFusingTest {};
 TEST_P(activation_quantize_i8, basic) {
     auto p = GetParam();
     create_topologies(input_layout("input", get_input_layout(p)),
-                      activation("act", "input", get_activation_function(p), get_activation_additional_params(p)),
-                      data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)),
-                      data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)),
+                      activation("act", "input", activation_func::relu),
+                      data("in_low", get_mem(get_single_element_layout(p), min_random, 0)),
+                      data("in_high", get_mem(get_single_element_layout(p), 1, max_random)),
                       data("out_low", get_mem(get_single_element_layout(p), -127, 0)),
                       data("out_high", get_mem(get_single_element_layout(p), 0, 127)),
                       quantize("quant", "act", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8),
-                      reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32));
+                      reorder("reorder_bfyx", "quant", p.default_format, data_types::f32));

-    tolerance = 1.f;
+    tolerance = 1.0f;
     execute(p);
 }

-INSTANTIATE_TEST_CASE_P(
-    fusings_gpu,
-    activation_quantize_i8,
-    ::testing::ValuesIn(std::vector<activation_test_params>{
-        activation_test_params{CASE_ACTIVATION_F32_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F32_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F32_3, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F32_4, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F32_5, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F32_6, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F32_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3}
-    }), );
-
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_fusings_gpu,
-    activation_quantize_i8,
-    ::testing::ValuesIn(std::vector<activation_test_params>{
-        // fp16 cases
-        activation_test_params{CASE_ACTIVATION_F16_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F16_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F16_3, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F16_4, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F16_5, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F16_6, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 3},
-
-        // u8 cases
-        activation_test_params{CASE_ACTIVATION_U8_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_U8_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_U8_1, 2, 3}
-    }), );
-
-class activation_opt_quantize_i8 : public ActivationFusingTest {};
-TEST_P(activation_opt_quantize_i8, basic) {
+TEST_P(activation_quantize_i8, per_channel) {
     auto p = GetParam();
-    implementation_desc activation_impl = {get_input_format(p), "activation_opt"};
-    this->bo_fused.set_option(build_option::force_implementations({{"act", activation_impl}}));
-
     create_topologies(input_layout("input", get_input_layout(p)),
-                      activation("act", "input", get_activation_function(p), get_activation_additional_params(p)),
+                      activation("act", "input", activation_func::relu),
                       data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)),
                       data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)),
                       data("out_low", get_mem(get_single_element_layout(p), -127, 0)),
                       data("out_high", get_mem(get_single_element_layout(p), 0, 127)),
                       quantize("quant", "act", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8),
-                      reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32));
+                      reorder("reorder_bfyx", "quant", p.default_format, data_types::f32));
+
+    tolerance = 1.0f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    fusings_gpu,
+    activation_quantize_i8,
+    ::testing::ValuesIn(std::vector<activation_test_params>{
+        // InputDataType = FP32
+        activation_test_params{CASE_ACTIVATION_F32_0, 2, 3, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_F32_1, 2, 3, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_0, 2, 3, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 3, "activation_opt"},
+
+        activation_test_params{CASE_ACTIVATION_F32_0, 2, 3, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_1, 2, 3, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_2, 2, 3, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_3, 2, 3, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_4, 2, 3, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_0, 2, 3, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 3, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_2, 2, 3, "activation_ref"},
+    }), );
+
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_fusings_gpu,
+    activation_quantize_i8,
+    ::testing::ValuesIn(std::vector<activation_test_params>{
+        activation_test_params{CASE_ACTIVATION_F32_5, 2, 3, "activation_ref"},     // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_F32_6, 2, 3, "activation_ref"},     // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_F32_7, 2, 3, "activation_ref"},     // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref"},  // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref"},  // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 3, "activation_ref"},  // FIXME - accuracy bug
+    }), );
+
+class activation_scale_activation_quantize_i8 : public ActivationFusingTest {};
+TEST_P(activation_scale_activation_quantize_i8, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      activation("act", "input", activation_func::relu),
+                      data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)),
+                      data("in_low", get_mem(get_single_element_layout(p), min_random, 0)),
+                      data("in_high", get_mem(get_single_element_layout(p), 1, max_random)),
+                      data("out_low", get_mem(get_single_element_layout(p), 0)),
+                      data("out_high", get_mem(get_single_element_layout(p), 255)),
+                      scale("scale", "act", "scale_data"),
+                      activation("act2", "scale", activation_func::softsign),
+                      quantize("quant", "act2", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8),
+                      reorder("reorder_bfyx", "quant", p.default_format, data_types::f32));
+
+    tolerance = 1.f;
+    execute(p);
+}
+
+TEST_P(activation_scale_activation_quantize_i8, per_channel) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      activation("act", "input", activation_func::relu),
+                      data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)),
+                      data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)),
+                      data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)),
+                      data("out_low", get_mem(get_single_element_layout(p), 0)),
+                      data("out_high", get_mem(get_single_element_layout(p), 255)),
+                      scale("scale", "act", "scale_data"),
+                      activation("act2", "scale", activation_func::softsign),
+                      quantize("quant", "act2", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8),
+                      reorder("reorder_bfyx", "quant", p.default_format, data_types::f32));

     tolerance = 1.f;
     execute(p);
@@ -3067,31 +3113,113 @@ TEST_P(activation_opt_quantize_i8, basic) {

 INSTANTIATE_TEST_CASE_P(
     fusings_gpu,
-    activation_opt_quantize_i8,
+    activation_scale_activation_quantize_i8,
     ::testing::ValuesIn(std::vector<activation_test_params>{
-        activation_test_params{CASE_ACTIVATION_F32_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F32_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F32_3, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F32_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F32_3, 2, 3}
+        // InputDataType = FP32
+        activation_test_params{CASE_ACTIVATION_F32_0, 2, 5, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_F32_1, 2, 5, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_0, 2, 5, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 5, "activation_opt"},
+
+        activation_test_params{CASE_ACTIVATION_F32_0, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_1, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_2, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_3, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_4, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_5, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_6, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_7, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_0, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 5, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_2, 2, 5, "activation_ref"},
     }), );

 INSTANTIATE_TEST_CASE_P(
     DISABLED_fusings_gpu,
-    activation_opt_quantize_i8,
+    activation_scale_activation_quantize_i8,
     ::testing::ValuesIn(std::vector<activation_test_params>{
-        // fp16 cases
-        activation_test_params{CASE_ACTIVATION_F16_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F16_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_F16_3, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 3},
+        activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref"},  // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 5, "activation_ref"},  // FIXME - accuracy bug
+    }), );
-        // u8 cases
-        activation_test_params{CASE_ACTIVATION_U8_1, 2, 3},
-        activation_test_params{CASE_ACTIVATION_3D_U8_1, 2, 3}
+
+class activation_scale_activation : public ActivationFusingTest {};
+TEST_P(activation_scale_activation, basic) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      activation("act", "input", activation_func::relu),
+                      data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)),
+                      scale("scale", "act", "scale_data"),
+                      activation("act2", "scale", activation_func::exp),
+                      reorder("reorder_bfyx", "act2", p.default_format, data_types::f32));
+
+    tolerance = 1e-05f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    fusings_gpu,
+    activation_scale_activation,
+    ::testing::ValuesIn(std::vector<activation_test_params>{
+        // InputDataType = FP32
+        activation_test_params{CASE_ACTIVATION_F32_0, 2, 4, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_F32_1, 2, 4, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_0, 2, 4, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 4, "activation_opt"},
+
+        activation_test_params{CASE_ACTIVATION_F32_0, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_1, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_2, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_3, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_4, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_5, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_6, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F32_7, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_0, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_1, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F32_2, 2, 4, "activation_ref"},
+
+        // InputDataType = FP16
+        activation_test_params{CASE_ACTIVATION_F16_0, 2, 4, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_F16_1, 2, 4, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_0, 2, 4, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_opt"},
+
+        activation_test_params{CASE_ACTIVATION_F16_0, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F16_1, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F16_2, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F16_3, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F16_4, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F16_5, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F16_6, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_F16_7, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_0, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_F16_5, 2, 4, "activation_ref"},
+
+        // InputDataType = UINT8
+        activation_test_params{CASE_ACTIVATION_U8_1, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_U8_2, 2, 4, "activation_ref"},
+
+        // InputDataType = INT8
+        activation_test_params{CASE_ACTIVATION_I8_1, 2, 4, "activation_opt"},
+        activation_test_params{CASE_ACTIVATION_3D_I8_1, 2, 4, "activation_opt"},
+
+        activation_test_params{CASE_ACTIVATION_I8_1, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_I8_2, 2, 4, "activation_ref"},
+        activation_test_params{CASE_ACTIVATION_3D_I8_1, 2, 4, "activation_ref"}
+    }), );
+
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_fusings_gpu,
+    activation_scale_activation,
+    ::testing::ValuesIn(std::vector<activation_test_params>{
+        activation_test_params{CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref"},  // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref"},  // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_3D_F32_6, 2, 4, "activation_ref"},  // FIXME - accuracy bug
+        activation_test_params{CASE_ACTIVATION_3D_F16_6, 2, 4, "activation_ref"},  // FIXME - accuracy bug
     }), );

 /* ----------------------------------------------------------------------------------------------------- */
@@ -3392,7 +3520,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_i8,
     ::testing::ValuesIn(std::vector<deconv_test_params>{
         // Some fusings disabled under deconvolution -> convolution optimization
         // Quantize fusing disabled for fp16/fp32 for performance reasons
-        // deconv_test_params{ CASE_DECONV_FP32_1, 4, 5 }, FIXME Failure due to activation + quantization fusing
+        deconv_test_params{ CASE_DECONV_FP32_1, 4, 5 },
         deconv_test_params{ CASE_DECONV_FP32_2, 3, 5 },
         deconv_test_params{ CASE_DECONV_FP32_3, 3, 5 },
         deconv_test_params{ CASE_DECONV_FP32_4, 3, 5 },
@@ -3438,7 +3566,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_i8,
         deconv_test_params{ CASE_DECONV_FP32_3D_8, 3, 5 },
         // FIXME no quantize implementation for bs_fs_yx_bsv16_fsv16 format AND add_required_reorders pass completely ruins data types
         // add_required_reorders pass tries to reorder everything to output type if no format exists, this ruins fp32 -> int8 quantize
-        // deconv_test_params{ CASE_DECONV_FP32_3D_9, 3, 5 },
+        //deconv_test_params{ CASE_DECONV_FP32_3D_9, 3, 5 },

         deconv_test_params{ CASE_DECONV_FP16_3D_1, 3, 5 },
         deconv_test_params{ CASE_DECONV_FP16_3D_2, 3, 5 },
@@ -3448,7 +3576,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_i8,
         deconv_test_params{ CASE_DECONV_FP16_3D_6, 3, 5 },
         deconv_test_params{ CASE_DECONV_FP16_3D_7, 3, 5 },
         deconv_test_params{ CASE_DECONV_FP16_3D_8, 3, 5 },
-        // deconv_test_params{ CASE_DECONV_FP16_3D_9, 3, 5 },
+        //deconv_test_params{ CASE_DECONV_FP16_3D_9, 3, 5 },

         deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 5 },
         deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 5 },
@@ -3505,7 +3633,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_
     ::testing::ValuesIn(std::vector<deconv_test_params>{
         // Some fusings disabled under deconvolution -> convolution optimization
         // Quantize fusing disabled for fp16/fp32 for performance reasons
-        // deconv_test_params{ CASE_DECONV_FP32_1, 7, 9 }, FIXME Failure due to activation + quantization fusing
+        deconv_test_params{ CASE_DECONV_FP32_1, 7, 9 },
         deconv_test_params{ CASE_DECONV_FP32_2, 6, 9 },
         deconv_test_params{ CASE_DECONV_FP32_3, 6, 9 },
         deconv_test_params{ CASE_DECONV_FP32_4, 6, 9 },
@@ -3541,7 +3669,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_
         deconv_test_params{ CASE_DECONV_S8S8_7, 2, 9 },
         deconv_test_params{ CASE_DECONV_S8S8_8, 2, 9 },

-        // deconv_test_params{ CASE_DECONV_FP32_3D_1, 6, 9 }, FIXME Failure due to activation + quantization fusing
+        deconv_test_params{ CASE_DECONV_FP32_3D_1, 6, 9 },
         deconv_test_params{ CASE_DECONV_FP32_3D_2, 6, 9 },
         deconv_test_params{ CASE_DECONV_FP32_3D_3, 6, 9 },
         deconv_test_params{ CASE_DECONV_FP32_3D_4, 6, 9 },