From c9d4e6b934c71258cb976ada959907c3cee6da73 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 30 Jun 2020 22:18:24 +0300 Subject: [PATCH] [IE CLDNN] Removed unused primitives and related structures (#1039) --- .../src/cldnn_engine/cldnn_program.cpp | 39 - inference-engine/src/cldnn_engine/dllmain.cpp | 22 - .../thirdparty/clDNN/api/activation.hpp | 7 - .../thirdparty/clDNN/api/activation_grad.hpp | 96 - .../thirdparty/clDNN/api/apply_adam.hpp | 111 - .../thirdparty/clDNN/api/batch_norm.hpp | 184 -- .../thirdparty/clDNN/api/batch_norm_grad.hpp | 61 - .../thirdparty/clDNN/api/contract.hpp | 95 - .../clDNN/api/convolution_grad_input.hpp | 95 - .../clDNN/api/convolution_grad_weights.hpp | 217 -- .../thirdparty/clDNN/api/deconvolution.hpp | 34 +- .../thirdparty/clDNN/api/eltwise.hpp | 49 +- .../thirdparty/clDNN/api/embed.hpp | 79 - .../clDNN/api/fully_connected_grad_input.hpp | 59 - .../api/fully_connected_grad_weights.hpp | 115 - .../thirdparty/clDNN/api/index_select.hpp | 109 - .../thirdparty/clDNN/api/lookup_table.hpp | 58 - .../thirdparty/clDNN/api/network.hpp | 6 - .../thirdparty/clDNN/api/scale_grad_input.hpp | 51 - .../clDNN/api/scale_grad_weights.hpp | 131 - .../clDNN/api/softmax_loss_grad.hpp | 47 - .../api_extension/fused_conv_bn_scale.hpp | 115 - .../api_extension/fused_conv_eltwise.hpp | 67 +- .../kernel_selector/common/common_tools.h | 1 - .../kernel_selector/common/common_types.h | 14 - .../activation/activation_kernel_base.cpp | 3 - .../activation/activation_kernel_opt.cpp | 1 - .../activation/activation_kernel_ref.cpp | 1 - .../batch_norm/batch_norm_kernel_base.cpp | 88 - .../batch_norm/batch_norm_kernel_base.h | 66 - .../batch_norm/batch_norm_kernel_ref.cpp | 41 - .../batch_norm/batch_norm_kernel_ref.h | 30 - .../batch_norm/batch_norm_kernel_selector.cpp | 29 - .../batch_norm/batch_norm_kernel_selector.h | 35 - .../batch_norm_grad_kernel_base.cpp | 72 - .../batch_norm_grad_kernel_base.h | 57 - .../batch_norm_grad_kernel_ref.cpp | 41 - .../batch_norm_grad_kernel_ref.h | 30 - .../batch_norm_grad_kernel_selector.cpp | 29 - .../batch_norm_grad_kernel_selector.h | 35 - .../contract/contract_kernel_base.cpp | 111 - .../contract/contract_kernel_base.h | 52 - .../contract/contract_kernel_ref.cpp | 49 - .../contract/contract_kernel_ref.h | 27 - .../contract/contract_kernel_selector.cpp | 24 - .../contract/contract_kernel_selector.h | 31 - ...on_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp | 87 - ...tion_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h | 41 - ...lution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp | 61 - ...volution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h | 37 - .../convolution_kernel_mmad_1x1_gemm.cpp | 108 - .../convolution_kernel_mmad_1x1_gemm.h | 40 - ...kernel_mmad_32x32sg_128x128wg_slm_int8.cpp | 180 -- ...n_kernel_mmad_32x32sg_128x128wg_slm_int8.h | 42 - ...kernel_mmad_32x32sg_224x128wg_slm_int8.cpp | 180 -- ...n_kernel_mmad_32x32sg_224x128wg_slm_int8.h | 42 - ...nvolution_kernel_mmad_32x32sg_slm_int8.cpp | 176 -- ...convolution_kernel_mmad_32x32sg_slm_int8.h | 41 - .../convolution_kernel_selector.cpp | 24 - .../convolution_grad_weights_kernel_1x1.cpp | 67 - .../convolution_grad_weights_kernel_1x1.h | 32 - .../convolution_grad_weights_kernel_3x3.cpp | 72 - .../convolution_grad_weights_kernel_3x3.h | 32 - .../convolution_grad_weights_kernel_7x7.cpp | 70 - .../convolution_grad_weights_kernel_7x7.h | 32 - .../convolution_grad_weights_kernel_base.cpp | 135 - .../convolution_grad_weights_kernel_base.h | 79 - .../convolution_grad_weights_kernel_ref.cpp | 45 - 
.../convolution_grad_weights_kernel_ref.h | 29 - ...nvolution_grad_weights_kernel_selector.cpp | 36 - ...convolution_grad_weights_kernel_selector.h | 34 - .../convolution_grad_weights_kernel_yxfb.cpp | 74 - .../convolution_grad_weights_kernel_yxfb.h | 32 - .../deconvolution_kernel_bfyx_opt.cpp | 1 - .../deconvolution_kernel_ref.cpp | 1 - .../eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp | 4 +- .../eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp | 288 -- .../eltwise/eltwise_kernel_b_fs_yx_fsv4.h | 36 - .../eltwise/eltwise_kernel_base.cpp | 15 +- .../eltwise/eltwise_kernel_base.h | 6 - .../eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp | 301 -- .../eltwise_kernel_fs_bs_yx_bsv4_fsv32.h | 32 - .../eltwise/eltwise_kernel_ref.cpp | 1 - .../eltwise/eltwise_kernel_selector.cpp | 4 - .../actual_kernels/embed/embed_kernel_ref.cpp | 108 - .../actual_kernels/embed/embed_kernel_ref.h | 42 - .../embed/embed_kernel_selector.cpp | 27 - .../embed/embed_kernel_selector.h | 35 - .../core/actual_kernels/embed/embed_params.h | 51 - .../fully_connected_kernel_base.cpp | 2 - .../fully_connected_kernel_mmad_batched.cpp | 122 - .../fully_connected_kernel_mmad_batched.h | 36 - .../fully_connected_kernel_selector.cpp | 2 - ...fully_connected_grad_input_kernel_base.cpp | 82 - .../fully_connected_grad_input_kernel_base.h | 54 - .../fully_connected_grad_input_kernel_ref.cpp | 44 - .../fully_connected_grad_input_kernel_ref.h | 29 - ...y_connected_grad_input_kernel_selector.cpp | 28 - ...lly_connected_grad_input_kernel_selector.h | 34 - ...lly_connected_grad_weights_kernel_base.cpp | 93 - ...fully_connected_grad_weights_kernel_base.h | 58 - ...ully_connected_grad_weights_kernel_ref.cpp | 43 - .../fully_connected_grad_weights_kernel_ref.h | 30 - ...connected_grad_weights_kernel_selector.cpp | 28 - ...y_connected_grad_weights_kernel_selector.h | 34 - .../fused_conv_bn_scale_kernel_base.cpp | 158 - .../fused_conv_bn_scale_kernel_base.h | 77 - .../fused_conv_bn_scale_kernel_ref.cpp | 71 - .../fused_conv_bn_scale_kernel_ref.h | 40 - .../fused_conv_bn_scale_kernel_selector.cpp | 26 - .../fused_conv_bn_scale_kernel_selector.h | 34 - ...used_conv_eltwise_kernel_af32_imad_1x1.cpp | 145 - .../fused_conv_eltwise_kernel_af32_imad_1x1.h | 40 - .../fused_conv_eltwise_kernel_base.cpp | 25 - .../fused_conv_eltwise_kernel_base.h | 12 - .../fused_conv_eltwise_kernel_gemm.cpp | 142 - .../fused_conv_eltwise_kernel_gemm.h | 42 - .../fused_conv_eltwise_kernel_imad.cpp | 221 -- .../fused_conv_eltwise_kernel_imad.h | 46 - ...kernel_mmad_32x32sg_128x128wg_slm_int8.cpp | 208 -- ...e_kernel_mmad_32x32sg_128x128wg_slm_int8.h | 42 - ...kernel_mmad_32x32sg_224x128wg_slm_int8.cpp | 208 -- ...e_kernel_mmad_32x32sg_224x128wg_slm_int8.h | 42 - .../fused_conv_eltwise_kernel_selector.cpp | 10 - .../index_select/index_select_kernel_base.cpp | 137 - .../index_select/index_select_kernel_base.h | 54 - .../index_select/index_select_kernel_ref.cpp | 54 - .../index_select/index_select_kernel_ref.h | 27 - .../index_select_kernel_selector.cpp | 24 - .../index_select_kernel_selector.h | 31 - .../lookup_table/lookup_table_kernel_axis.cpp | 88 - .../lookup_table/lookup_table_kernel_axis.h | 30 - .../lookup_table/lookup_table_kernel_base.cpp | 84 - .../lookup_table/lookup_table_kernel_base.h | 64 - .../lookup_table/lookup_table_kernel_ref.cpp | 41 - .../lookup_table/lookup_table_kernel_ref.h | 30 - .../lookup_table_kernel_selector.cpp | 31 - .../lookup_table_kernel_selector.h | 35 - .../permute/permute_kernel_ref.cpp | 2 +- .../pooling_kernel_gpu_average_opt.cpp | 102 - 
.../pooling/pooling_kernel_gpu_average_opt.h | 33 - ...pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp | 111 - .../pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h | 39 - ..._kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp | 106 - ...ng_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h | 38 - .../pooling/pooling_kernel_selector.cpp | 6 - .../quantize/quantize_kernel_base.cpp | 4 +- .../scale_grad_weights_kernel_base.cpp | 79 - .../scale_grad_weights_kernel_base.h | 58 - .../scale_grad_weights_kernel_ref.cpp | 44 - .../scale_grad_weights_kernel_ref.h | 30 - .../scale_grad_weights_kernel_selector.cpp | 27 - .../scale_grad_weights_kernel_selector.h | 35 - .../softmax_loss_grad_kernel_base.cpp | 77 - .../softmax_loss_grad_kernel_base.h | 49 - .../softmax_loss_grad_kernel_ref.cpp | 41 - .../softmax_loss_grad_kernel_ref.h | 29 - .../softmax_loss_grad_kernel_selector.cpp | 26 - .../softmax_loss_grad_kernel_selector.h | 33 - .../strided_slice_kernel_ref.cpp | 4 +- .../core/cl_kernels/batch_norm_gpu_ref.cl | 121 - .../cl_kernels/batch_norm_grad_gpu_ref.cl | 80 - .../core/cl_kernels/contract_ref.cl | 64 - .../convolution_gpu_1x1_gemm_mmad.cl | 120 - ...lution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl | 202 -- ...onvolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl | 105 - ...ion_gpu_mmad_32x32sg_128x128wg_slm_int8.cl | 396 --- ...ion_gpu_mmad_32x32sg_224x128wg_slm_int8.cl | 389 --- .../convolution_gpu_mmad_32x32sg_slm_int8.cl | 430 --- .../convolution_gpu_mmad_b_fs_yx_fsv32.cl | 5 - .../convolution_gpu_mmad_batched.cl | 116 - .../convolution_gpu_mmad_batched_block.cl | 199 -- .../convolution_gpu_mmad_batched_block_1x1.cl | 247 -- ...volution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl | 5 - ...nvolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl | 5 - .../convolution_gpu_mmad_slm_2x14_rep4.cl | 945 ------ .../convolution_gpu_mmad_slm_7x7_rep4.cl | 1044 ------- .../convolution_grad_weights_1x1.cl | 136 - .../convolution_grad_weights_3x3.cl | 182 -- .../convolution_grad_weights_7x7.cl | 105 - .../convolution_grad_weights_ref.cl | 122 - .../convolution_grad_weights_yxfb.cl | 118 - .../cl_kernels/deconvolution_gpu_bfyx_opt.cl | 10 - .../core/cl_kernels/deconvolution_gpu_ref.cl | 19 - .../core/cl_kernels/eltwise_b_fs_yx_fsv4.cl | 104 - .../cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl | 83 - .../core/cl_kernels/embed_ref.cl | 34 - .../fully_connected_gpu_mmad_batched.cl | 115 - .../fully_connected_grad_input_gpu_ref.cl | 46 - .../fully_connected_grad_weights_gpu_ref.cl | 80 - .../fused_conv_bn_scale_kernel_ref.cl | 197 -- .../fused_conv_eltwise_gpu_gemm_fp32.cl | 602 ---- ...ise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl | 510 ---- ...ise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl | 505 ---- .../gen9_common_conv_fwd_data_f16.cl | 6 - .../gen9_common_conv_fwd_data_f32.cl | 6 - .../core/cl_kernels/generic_eltwise_ref.cl | 6 +- .../core/cl_kernels/index_select_gpu_ref.cl | 103 - .../core/cl_kernels/lookup_table_axis.cl | 77 - .../core/cl_kernels/lookup_table_ref.cl | 32 - .../pooling_gpu_fs_bs_yx_bsv4_fsv32.cl | 227 -- .../pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl | 124 - .../cl_kernels/scale_grad_weights_gpu_ref.cl | 93 - .../cl_kernels/softmax_loss_grad_gpu_ref.cl | 38 - .../core/common/common_kernel_base.cpp | 12 +- .../core/common/common_kernel_base.h | 2 - .../kernel_selector/core/common/jitter.cpp | 34 +- .../core/common/training_kernel_base.cpp | 34 - .../core/common/training_kernel_base.h | 34 - .../core/common/training_params.cpp | 29 - .../core/common/training_params.h | 44 - .../kernel_selector/core/kernel_base.cpp | 1 - .../core/kernel_selector_common.cpp | 3 - 
.../core/kernel_selector_common.h | 5 - .../core/kernel_selector_params.cpp | 4 - .../core/kernel_selector_params.h | 20 +- .../thirdparty/clDNN/src/activation_grad.cpp | 95 - .../thirdparty/clDNN/src/apply_adam.cpp | 97 - .../thirdparty/clDNN/src/batch_norm.cpp | 116 - .../thirdparty/clDNN/src/batch_norm_grad.cpp | 55 - .../thirdparty/clDNN/src/contract.cpp | 126 - .../clDNN/src/convolution_grad_weights.cpp | 214 -- .../thirdparty/clDNN/src/deconvolution.cpp | 28 +- .../thirdparty/clDNN/src/eltwise.cpp | 36 - .../thirdparty/clDNN/src/embed.cpp | 87 - .../clDNN/src/fully_connected_grad_input.cpp | 76 - .../src/fully_connected_grad_weights.cpp | 70 - .../clDNN/src/fused_conv_bn_scale.cpp | 229 -- .../clDNN/src/gpu/activation_grad_gpu.cpp | 102 - .../clDNN/src/gpu/apply_adam_gpu.cpp | 181 -- .../clDNN/src/gpu/batch_norm_gpu.cpp | 156 - .../clDNN/src/gpu/batch_norm_grad_gpu.cpp | 87 - .../thirdparty/clDNN/src/gpu/contract_gpu.cpp | 86 - .../src/gpu/convolution_grad_weights_gpu.cpp | 194 -- .../clDNN/src/gpu/deconvolution_gpu.cpp | 2 - .../thirdparty/clDNN/src/gpu/eltwise_gpu.cpp | 50 +- .../thirdparty/clDNN/src/gpu/embed_gpu.cpp | 86 - .../gpu/fully_connected_grad_input_gpu.cpp | 86 - .../gpu/fully_connected_grad_weights_gpu.cpp | 113 - .../clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp | 160 - .../clDNN/src/gpu/fused_conv_eltwise_gpu.cpp | 41 - .../clDNN/src/gpu/index_select_gpu.cpp | 107 - .../thirdparty/clDNN/src/gpu/kernel.cpp | 68 - .../thirdparty/clDNN/src/gpu/kernel.h | 5 - .../clDNN/src/gpu/lookup_table_gpu.cpp | 129 - .../thirdparty/clDNN/src/gpu/register_gpu.cpp | 15 - .../thirdparty/clDNN/src/gpu/register_gpu.hpp | 28 - .../clDNN/src/gpu/scale_grad_input_gpu.cpp | 93 - .../clDNN/src/gpu/scale_grad_weights_gpu.cpp | 95 - .../clDNN/src/gpu/softmax_loss_grad_gpu.cpp | 74 - .../add_reshape_to_primitives.cpp | 114 - .../src/graph_optimizer/pre_optimize_bias.cpp | 3 - .../graph_optimizer/pre_replace_deconv.cpp | 5 +- .../prepare_primitive_fusing.cpp | 32 +- .../src/graph_optimizer/trim_to_outputs.cpp | 2 - .../clDNN/src/include/activation_grad_inst.h | 58 - .../clDNN/src/include/apply_adam_inst.h | 68 - .../clDNN/src/include/batch_norm_grad_inst.h | 52 - .../clDNN/src/include/batch_norm_inst.h | 108 - .../clDNN/src/include/contract_inst.h | 52 - .../include/convolution_grad_weights_inst.h | 158 - .../clDNN/src/include/eltwise_inst.h | 35 +- .../thirdparty/clDNN/src/include/embed_inst.h | 56 - .../include/fully_connected_grad_input_inst.h | 54 - .../fully_connected_grad_weights_inst.h | 64 - .../src/include/fused_conv_bn_scale_inst.h | 119 - .../src/include/fused_conv_eltwise_inst.h | 65 +- .../clDNN/src/include/index_select_inst.h | 59 - .../src/include/kernel_selector_helper.h | 25 - .../clDNN/src/include/layout_optimizer.h | 1 - .../clDNN/src/include/lookup_table_inst.h | 53 - .../clDNN/src/include/scale_grad_input_inst.h | 54 - .../src/include/scale_grad_weights_inst.h | 67 - .../src/include/softmax_loss_grad_inst.h | 40 - .../thirdparty/clDNN/src/index_select.cpp | 139 - .../clDNN/src/kernel_selector_helper.cpp | 28 - .../thirdparty/clDNN/src/lookup_table.cpp | 61 - .../thirdparty/clDNN/src/network.cpp | 8 - .../thirdparty/clDNN/src/primitive_inst.cpp | 5 - .../thirdparty/clDNN/src/program.cpp | 3 - .../thirdparty/clDNN/src/scale_grad_input.cpp | 100 - .../clDNN/src/scale_grad_weights.cpp | 112 - .../clDNN/src/softmax_loss_grad.cpp | 50 - .../test_cases/add_reorders_gpu_test.cpp | 59 +- .../tests/test_cases/apply_adam_gpu_test.cpp | 109 - .../tests/test_cases/batch_norm_gpu_test.cpp | 
2663 ----------------- .../test_cases/batch_norm_grad_gpu_test.cpp | 114 - .../tests/test_cases/contract_gpu_test.cpp | 362 --- .../tests/test_cases/convolution_gpu_test.cpp | 871 ------ .../convolution_grad_input_gpu_test.cpp | 208 -- .../convolution_grad_weights_gpu_test.cpp | 1112 ------- .../clDNN/tests/test_cases/embed_gpu_test.cpp | 164 - .../fully_connected_grad_input_gpu_test.cpp | 89 - .../fully_connected_grad_weights_gpu_test.cpp | 249 -- .../fused_conv_eltwise_gpu_test.cpp | 55 - .../tests/test_cases/fusings_gpu_test.cpp | 18 +- .../test_cases/index_select_gpu_test.cpp | 1672 ----------- .../tests/test_cases/lookup_table_test.cpp | 251 -- .../test_cases/scale_grad_input_test.cpp | 90 - .../test_cases/scale_grad_weights_test.cpp | 325 -- .../test_cases/softmax_loss_grad_gpu_test.cpp | 65 - 301 files changed, 58 insertions(+), 31335 deletions(-) delete mode 100644 inference-engine/src/cldnn_engine/dllmain.cpp delete mode 100644 inference-engine/thirdparty/clDNN/api/activation_grad.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/apply_adam.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/batch_norm.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/contract.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/embed.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/index_select.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/lookup_table.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp delete mode 100644 inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp 
delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl delete mode 100644 
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_kernel_base.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_kernel_base.h delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_params.cpp delete mode 100644 inference-engine/thirdparty/clDNN/kernel_selector/core/common/training_params.h delete mode 100644 inference-engine/thirdparty/clDNN/src/activation_grad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/apply_adam.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/batch_norm.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/batch_norm_grad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/contract.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/convolution_grad_weights.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/embed.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/fully_connected_grad_input.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/fully_connected_grad_weights.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/fused_conv_bn_scale.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/activation_grad_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/apply_adam_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/batch_norm_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/batch_norm_grad_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/contract_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/convolution_grad_weights_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/embed_gpu.cpp delete mode 
100644 inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_input_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/fully_connected_grad_weights_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/fused_conv_bn_scale_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/index_select_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/lookup_table_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/scale_grad_input_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/scale_grad_weights_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/gpu/softmax_loss_grad_gpu.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/graph_optimizer/add_reshape_to_primitives.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/include/activation_grad_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/apply_adam_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/batch_norm_grad_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/batch_norm_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/contract_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/convolution_grad_weights_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/embed_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/fully_connected_grad_input_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/fully_connected_grad_weights_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/fused_conv_bn_scale_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/index_select_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/lookup_table_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/scale_grad_input_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/scale_grad_weights_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/include/softmax_loss_grad_inst.h delete mode 100644 inference-engine/thirdparty/clDNN/src/index_select.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/lookup_table.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/scale_grad_input.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/scale_grad_weights.cpp delete mode 100644 inference-engine/thirdparty/clDNN/src/softmax_loss_grad.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/apply_adam_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/batch_norm_grad_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/contract_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_input_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/convolution_grad_weights_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/embed_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_input_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/fully_connected_grad_weights_gpu_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/index_select_gpu_test.cpp delete mode 100644 
inference-engine/thirdparty/clDNN/tests/test_cases/lookup_table_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_input_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/scale_grad_weights_test.cpp delete mode 100644 inference-engine/thirdparty/clDNN/tests/test_cases/softmax_loss_grad_gpu_test.cpp

diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp
index 37c30015c29..bce89f2a8d5 100644
--- a/inference-engine/src/cldnn_engine/cldnn_program.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -1533,49 +1532,11 @@ void Program::CreateBatchNormalizationPrimitive(cldnn::topology& topology, Infer
     cldnn::primitive_id weightID = bnLayerName + "_" + m_scalesTag;
     cldnn::primitive_id biasID = bnLayerName + "_" + m_biasesTag;
 
-#define _SCALE_BN_OPT
-#ifdef _SCALE_BN_OPT
-    // Using scale as an optimization (1 mad instead of mad+rsq)
-    // create new blobs for scale shift
     CreateScaleWeightsAndBiasesFromBN(topology, bnLayer, weightID, biasID);
 
     auto scalePrim = cldnn::scale(bnLayerName, inputPrimitives[0], weightID, biasID);
 
     topology.add(scalePrim);
-#else
-    cldnn::tensor blobTensor(0);
-    const auto bnDims = bnLayer->outData[0]->getTensorDesc().getDims();
-    switch (bnDims.size()) {
-    case 2:
-        blobTensor = cldnn::feature(TensorValue(bnDims[1]));
-        break;
-    case 4:
-        blobTensor = cldnn::feature(TensorValue(bnDims[1]));
-        break;
-    default:
-        THROW_CLDNN_EXCEPTION("Batch normalization input doesn't have 2 or 4 dimensions in " << bnLayer->name);
-    }
-    cldnn::layout blobLayout(
-        DataTypeFromPrecision(layer->precision),
-        m_defaultFormat,
-        blobTensor);
-
-    // Create variance primitive
-    cldnn::primitive_id varianceID = bnLayerName + "_" + m_weightsTag;
-    varianceID = CreatePrimitiveFromBlob(topology, varianceID, bnLayer->_weights, blobLayout);
-
-    // Create mean primitive
-    cldnn::primitive_id meanID = bnLayerName + "_" + m_biasesTag;
-    meanID = CreatePrimitiveFromBlob(topology, meanID, bnLayer->_biases, blobLayout);
-
-    auto bnPrim = cldnn::batch_norm(
-        bnLayerName,
-        inputPrimitives[0],
-        meanID,
-        varianceID,
-        bnLayer->epsilon);
-
-    topology.add(bnPrim);
-#endif  // _SCALE_BN_OPT
 
     AddPrimitiveToProfiler(bnLayerName, layer);
 }

diff --git a/inference-engine/src/cldnn_engine/dllmain.cpp b/inference-engine/src/cldnn_engine/dllmain.cpp
deleted file mode 100644
index a484571a204..00000000000
--- a/inference-engine/src/cldnn_engine/dllmain.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-// dllmain.cpp : Defines the entry point for the DLL application.
-#ifdef _WIN32
-#include 
-
-BOOL APIENTRY DllMain(HMODULE hModule,
-                      DWORD ul_reason_for_call,
-                      LPVOID lpReserved) {
-    switch (ul_reason_for_call) {
-    case DLL_PROCESS_ATTACH:
-    case DLL_THREAD_ATTACH:
-    case DLL_THREAD_DETACH:
-    case DLL_PROCESS_DETACH:
-        break;
-    }
-    return TRUE;
-}
-
-#endif

diff --git a/inference-engine/thirdparty/clDNN/api/activation.hpp b/inference-engine/thirdparty/clDNN/api/activation.hpp
index 9c88a38d7ea..80a120d3f73 100644
--- a/inference-engine/thirdparty/clDNN/api/activation.hpp
+++ b/inference-engine/thirdparty/clDNN/api/activation.hpp
@@ -71,13 +71,6 @@ enum class activation_func {
     gelu                  // (0.5*val*(1 + erf(val / sqrt(2))))
 };
 
-/// @brief activation gradient functions
-enum class activation_grad_func {
-    none,                 // val
-    relu,                 // val * (input > 0)
-    relu_negative_slope,  // val * ((input > 0) + a * (input <= 0)) (a is additional param)
-};
-
 /// @brief activation additional params
 struct activation_additional_params {
     float a, b;

diff --git a/inference-engine/thirdparty/clDNN/api/activation_grad.hpp b/inference-engine/thirdparty/clDNN/api/activation_grad.hpp
deleted file mode 100644
index d2d4d628dfe..00000000000
--- a/inference-engine/thirdparty/clDNN/api/activation_grad.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include "activation.hpp"
-#include 
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Activation gradient for rectified linear unit or parameterized rectified linear unit.
-/// @par Algorithm:
-///   out(i,x,y) = input_gradient(i,x,y) * ((input(i,x,y) > 0) + slope(i) * (input(i,x,y) <= 0))
-/// @par Where:
-/// @li out(i,x,y) : value at x, y from i-th feature map after activation.
-/// @li in(i,x,y) : value at x, y from i-th feature map before activation.
-/// @li slope(i) : the slope value of the i-th feature map (can be shared across channels or one slope per channel).
-struct activation_grad : public primitive_base {
-    CLDNN_DECLARE_PRIMITIVE(activation_grad)
-
-    /// @brief Constructs Relu grad primitive.
-    /// @param id This primitive id.
-    /// @param input_grad Input gradient primitive id.
-    /// @param input Input primitive id.
-    /// @param activation_grad_function activation_grad function.
-    /// @param additional_params additional params (slope).
-    activation_grad(const primitive_id& id,
-                    const primitive_id& input_grad,
-                    const primitive_id& input,
-                    activation_grad_func activation_grad_function,
-                    activation_additional_params additional_params = {0.f, 0.f},
-                    const padding& output_padding = padding())
-        : primitive_base(id, {input_grad, input}, output_padding),
-          activation_grad_function(activation_grad_function),
-          additional_params(additional_params),
-          additional_params_input("") {}
-
-    /// @brief Constructs Relu grad primitive.
-    /// @param id This primitive id.
-    /// @param input_grad Input gradient primitive id.
-    /// @param input Input primitive id.
-    /// @param activation_grad_function activation_grad function.
-    /// @param additional_params_input Additional params (slope) input primitive id.
-    activation_grad(const primitive_id& id,
-                    const primitive_id& input_grad,
-                    const primitive_id& input,
-                    const primitive_id& additional_params_input,
-                    activation_grad_func activation_grad_function,
-                    const padding& output_padding = padding())
-        : primitive_base(id, {input_grad, input}, output_padding),
-          activation_grad_function(activation_grad_function),
-          additional_params({0, 0}),
-          additional_params_input(additional_params_input) {}
-
-    /// @brief activation_grad function.
-    activation_grad_func activation_grad_function;
-
-    /// @brief activation_grad additional params.
-    activation_additional_params additional_params;
-
-    /// @brief PRelu activation slope input primitive id.
-    /// Input x dimension should be equal to input feature size (one slope per channel).
-    /// All other dimensions should be 1.
-    primitive_id additional_params_input;
-
-protected:
-    std::vector> get_dependencies() const override {
-        if (additional_params_input.empty())
-            return {};
-        return {additional_params_input};
-    }
-};
-/// @}
-/// @}
-/// @}
-}  // namespace cldnn

diff --git a/inference-engine/thirdparty/clDNN/api/apply_adam.hpp b/inference-engine/thirdparty/clDNN/api/apply_adam.hpp
deleted file mode 100644
index f74523b7062..00000000000
--- a/inference-engine/thirdparty/clDNN/api/apply_adam.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include 
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Apply Adam primitive.
-/// @details Updates output using the Adam algorithm. The output of this primitive should be of mutable_data type if the user wants to update
-/// the variable across the network. If the output is not mutable_data, it will be initialized with 0.
-/// "Adam: A Method for Stochastic Optimization" by Diederik P.
-/// Kingma, Jimmy Ba
-/// @n See: https://arxiv.org/abs/1412.6980
-///
-/// Algorithm:
-/// @n float lr[t] = lr * sqrt(1 - beta2^t) / (1 - beta1^t);
-/// @n float m[t] = beta1 * m[t-1] + (1 - beta1) * grad[t];
-/// @n float v[t] = beta2 * v[t-1] + (1 - beta2) * grad[t] * grad[t];
-/// @n float result = result - lr[t] * m[t] / (sqrt(v[t]) + epsilon);
-
-struct apply_adam : public primitive_base {
-    CLDNN_DECLARE_PRIMITIVE(apply_adam)
-
-    /// @brief Constructs apply Adam primitive.
-    /// @param id This primitive id.
-    /// @param input Input gradient primitive id.
-    /// @param m Primitive id containing mean data.
-    /// @param v Primitive id containing variance.
-    /// @param beta1_power Primitive id containing beta1^t.
-    /// @param beta2_power Primitive id containing beta2^t.
-    /// @param lr Learning rate parameter.
-    /// @param beta1 Beta1 parameter.
-    /// @param beta2 Beta2 parameter.
-    /// @param epsilon Epsilon.
-    /// @param dependency_id Optional primitive id that needs to complete before execution of this primitive. Used only for synchronization.
-    apply_adam(const primitive_id& id,
-               const primitive_id& input,
-               const primitive_id& m,
-               const primitive_id& v,
-               const primitive_id& beta1_power,
-               const primitive_id& beta2_power,
-               float lr,
-               float beta1,
-               float beta2,
-               float epsilon,
-               const primitive_id& dependency_id = "",
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          m(m),
-          v(v),
-          beta1_power(beta1_power),
-          beta2_power(beta2_power),
-          lr(lr),
-          beta1(beta1),
-          beta2(beta2),
-          epsilon(epsilon),
-          dependency_id(dependency_id) {}
-
-    /// @brief Primitive id containing m data.
-    primitive_id m;
-    /// @brief Primitive id containing v data.
-    primitive_id v;
-    /// @brief Primitive id containing beta1^t.
-    primitive_id beta1_power;
-    /// @brief Primitive id containing beta2^t.
-    primitive_id beta2_power;
-    /// @brief Learning rate parameter.
-    float lr;
-    /// @brief Beta1 parameter.
-    float beta1;
-    /// @brief Beta2 parameter.
-    float beta2;
-    /// @brief Epsilon.
-    float epsilon;
-    /// @brief Optional primitive id that needs to complete before execution of this primitive. Used only for synchronization.
-    primitive_id dependency_id;
-
-protected:
-    std::vector> get_dependencies() const override {
-        std::vector> ret{m, v, beta1_power, beta2_power};
-        ret.reserve(!dependency_id.empty());
-        if (!dependency_id.empty())
-            ret.push_back(dependency_id);
-        return ret;
-    }
-};
-/// @}
-/// @}
-/// @}
-}  // namespace cldnn

diff --git a/inference-engine/thirdparty/clDNN/api/batch_norm.hpp b/inference-engine/thirdparty/clDNN/api/batch_norm.hpp
deleted file mode 100644
index 29b8e69d960..00000000000
--- a/inference-engine/thirdparty/clDNN/api/batch_norm.hpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include 
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Batch normalization primitive.
-/// @details Performs batch normalization as described in
-/// "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" by Ioffe, Szegedy
-/// @n See: http://arxiv.org/abs/1502.03167
-///
-/// Algorithm:
-/// @n global stats can be computed as:
-/// @n out[i] = ( (in[i] - mean[b]) / sqrt(variance[b] + epsilon) ) * scale[b] + shift[b]
-
-struct batch_norm : public primitive_base {
-    CLDNN_DECLARE_PRIMITIVE(batch_norm)
-
-    /// @brief Constructs batch normalization primitive.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param mean Primitive id containing mean data.
-    /// @param variance Primitive id containing variance.
-    /// @param epsilon Epsilon.
-    batch_norm(const primitive_id& id,
-               const primitive_id& input,
-               const primitive_id& mean,
-               const primitive_id& variance,
-               float epsilon,
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          mean(mean),
-          variance(variance),
-          inv_variance(""),
-          epsilon(epsilon) {}
-
-    /// @brief Constructs batch normalization primitive.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param mean Primitive id containing mean data.
-    /// @param variance Primitive id containing variance.
-    /// @param scale Primitive id containing scale.
-    /// @param shift Primitive id containing shift.
-    /// @param epsilon Epsilon.
-    batch_norm(const primitive_id& id,
-               const primitive_id& input,
-               const primitive_id& mean,
-               const primitive_id& variance,
-               const primitive_id& scale,
-               const primitive_id& shift,
-               float epsilon,
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          mean(mean),
-          variance(variance),
-          scale(scale),
-          shift(shift),
-          inv_variance(""),
-          epsilon(epsilon) {}
-
-    /// @brief Constructs batch normalization primitive with mean and variance calculation (used for training).
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param epsilon Epsilon.
-    /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. For inference leave empty.
-    batch_norm(const primitive_id& id,
-               const primitive_id& input,
-               float epsilon,
-               const primitive_id& inv_variance = "",
-               const padding& output_padding = padding())
-        : primitive_base(id, {input}, output_padding),
-          mean(""),
-          variance(""),
-          inv_variance(inv_variance),
-          epsilon(epsilon) {}
-
-    /// @brief Constructs batch normalization primitive with mean and variance calculation (used for training).
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param scale Primitive id containing scale.
-    /// @param shift Primitive id containing shift.
-    /// @param epsilon Epsilon.
-    /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. For inference leave empty.
- batch_norm(const primitive_id& id, - const primitive_id& input, - float epsilon, - const primitive_id& scale, - const primitive_id& shift, - const primitive_id& inv_variance = "", - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), - mean(""), - variance(""), - scale(scale), - shift(shift), - inv_variance(inv_variance), - epsilon(epsilon) {} - - /// @brief Constructs batch normalization primitive with mean and variance calculation (used for training). - /// @param id This primitive id. - /// @param input Input primitive id. - /// @brief scale Primitive id containing scale. - /// @brief shift Primitive id containing shift. - /// @brief mean_out Primitive id containing mean output. - /// @brief variance_out Primitive id containing variance output. - /// @param epsilon Epsilon. - /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. For inference leave empty. - batch_norm(const primitive_id& id, - const primitive_id& input, - float epsilon, - const primitive_id& mean_out, - const primitive_id& variance_out, - const primitive_id& scale, - const primitive_id& shift, - const primitive_id& inv_variance = "", - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), - mean(mean_out), - variance(variance_out), - scale(scale), - shift(shift), - inv_variance(inv_variance), - epsilon(epsilon) {} - - /// @brief Primitive id containing mean data. - primitive_id mean; - /// @brief Primitive id containing variance. - primitive_id variance; - /// @brief Primitive id containing scale. - primitive_id scale; - /// @brief Primitive id containing shift. - primitive_id shift; - /// @brief Primitive id containing inverted variance used in future gradient computing. - primitive_id inv_variance; - /// @brief Epsilon. - float epsilon; - -protected: - std::vector> get_dependencies() const override { - std::vector> deps; - - if (!mean.empty() && !variance.empty()) { - deps.push_back(mean); - deps.push_back(variance); - } - - if (!scale.empty() && !shift.empty()) { - deps.push_back(scale); - deps.push_back(shift); - } - - if (!inv_variance.empty()) - deps.push_back(inv_variance); - - return deps; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp b/inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp deleted file mode 100644 index cf487ad8575..00000000000 --- a/inference-engine/thirdparty/clDNN/api/batch_norm_grad.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Performs the backward pass of batch normalization.
-/// @details Calculates the mean gradient and the mean of gradient * input for every feature in the data;
-/// the output is then calculated as inv_variance * (input_grad - mean_grad_input * input - mean_grad).
-struct batch_norm_grad : public primitive_base<batch_norm_grad> {
-    CLDNN_DECLARE_PRIMITIVE(batch_norm_grad)
-
-    /// @brief Constructs batch normalization backward layer.
-    /// @param id This primitive id.
-    /// @param input_grad Input gradient primitive id.
-    /// @param input Input primitive id.
-    /// @param inv_variance Primitive id containing inverted variance from forward pass.
-    batch_norm_grad(
-        const primitive_id& id,
-        const primitive_id& input_grad,
-        const primitive_id& input,
-        const primitive_id& inv_variance,
-        const padding& output_padding = padding())
-        : primitive_base(id, {input_grad, input}, output_padding), inv_variance(inv_variance) {
-    }
-
-    /// @brief Primitive id containing inverted variance from forward pass.
-    primitive_id inv_variance;
-
-protected:
-    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
-        return {inv_variance};
-    }
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api/contract.hpp b/inference-engine/thirdparty/clDNN/api/contract.hpp
deleted file mode 100644
index 9242b4e845e..00000000000
--- a/inference-engine/thirdparty/clDNN/api/contract.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Select mode for the @ref contract layer.
-enum class contract_mode : int32_t {
-    /// @brief Sum reduction.
-    sum,
-    /// @brief Product reduction.
-    prod,
-    /// @brief All reduction.
-    all,
-    /// @brief Any reduction.
-    any,
-    /// @brief Max reduction.
-    max
-};
-
-/// @brief Reduces the input with an operation defined by @p mode along the dimensions
-/// defined by @p reduction_axes.
-///
-/// @details Reduces the input using the binary operation determined by
-/// @p mode. The @p reduction_axes determine the final shape of the
-/// output, which is calculated based on the input shape by
-/// collapsing the dimensions along which the reduction happens.
-/// For example, for the input with -/// @n input_sizes = (in_b, in_f, in_y, in_x) -/// @n a reduction with -/// @n reduction_axes = (2) -/// @n would collapse the Y dimension, producing -/// @n output_shape = (1, in_b, in_f, in_x) -/// @n where every element is a @p mode reduction of the input elements with -/// @n the same B, F and X coordinates. -/// @n -/// @n@b Requirements: -/// @n - @p reduction_axes size (dimensions count) must be within (inclusive) range -/// 1 - 4. -/// @n - @p reduction_axes mustn't have duplicate values. -/// @n - Values of @p reduction_axes must be within (inclusive) range 0 - 3 -/// @n Breaking any of these conditions will raise an exception. -struct contract : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(contract) - - /// @brief Constructs contract primitive / layer. - /// - /// @param id An identifier of new primitive. - /// @param input An identifier of primitive which is an input for newly created - /// contract primitive. - /// @param mode Reduction mode. - /// @param reduction_axes Axes positions (0-based, from left to right) in input_shape - /// that are being reduced. - /// @param output_padding Optional padding for output from primitive. - contract( - const primitive_id& id, - const primitive_id& input, - contract_mode mode, - const std::vector& reduction_axes = {}, - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), - mode(mode), - reduction_axes(reduction_axes) { - } - /// @param mode Contract mode. - contract_mode mode; - /// @brief Array of axes positions from input shape (0-based, from left to right) - /// along which reduction should happen. - std::vector reduction_axes; -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp b/inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp deleted file mode 100644 index 534aedb78d6..00000000000 --- a/inference-engine/thirdparty/clDNN/api/convolution_grad_input.hpp +++ /dev/null @@ -1,95 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "deconvolution.hpp" -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward convolution operation for input. -/// @details convolution_grad_input is similar to deconvolution layer without biases and activation support. -/// It actually uses deconvolution primitive underneath with gradient bool set to true. -struct convolution_grad_input : public deconvolution { - /// @brief Constructs convolution_grad_input primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights List of primitive ids containing weights data. 
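To make the reduction semantics concrete, here is a standalone sketch of a contract_mode::sum reduction over the Y axis of a bfyx buffer, matching the reduction_axes = (2) example above; the function name and flat indexing are illustrative:

    #include <vector>

    // Sums a bfyx-laid-out buffer along Y:
    // out[b][f][x] = sum over y of in[b][f][y][x], the shape collapse the
    // contract docs describe for reduction_axes = (2).
    std::vector<float> contract_sum_over_y(const std::vector<float>& in,
                                           size_t B, size_t F, size_t Y, size_t X) {
        std::vector<float> out(B * F * X, 0.0f);
        for (size_t b = 0; b < B; ++b)
            for (size_t f = 0; f < F; ++f)
                for (size_t y = 0; y < Y; ++y)
                    for (size_t x = 0; x < X; ++x)
                        out[(b * F + f) * X + x] += in[((b * F + f) * Y + y) * X + x];
        return out;
    }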
- /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_input window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param with_activation Enables Relu activation. - /// @param activation_slp Relu activation slope. - convolution_grad_input(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding()) - : deconvolution(id, input, {weights}, stride, input_offset, output_padding, true) {} - - /// @brief Constructs convolution_grad_input primitive (computes input paddings to match output size). - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_input window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param with_activation Enables Relu activation. - /// @param activation_slp Relu activation slope. - /// @param output_size User-defined output data size of the primitive (w/o padding). - convolution_grad_input(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - tensor stride, - tensor input_offset, - tensor output_size, - const padding& output_padding = padding()) - : deconvolution(id, input, {weights}, stride, input_offset, output_size, output_padding, true) {} - - /// @brief Constructs convolution_grad_input primitive (computes input paddings to match output size). - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_input window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param with_activation Enables Relu activation. - /// @param activation_slp Relu activation slope. - /// @param output_size User-defined output data size of the primitive (w/o padding). - /// @return convolution_grad_input primitive with specified settings. - static convolution_grad_input create_with_output_size(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - tensor output_size, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding()) { - return convolution_grad_input(id, input, weights, stride, input_offset, output_size, output_padding); - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp b/inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp deleted file mode 100644 index fa15fa73fca..00000000000 --- a/inference-engine/thirdparty/clDNN/api/convolution_grad_weights.hpp +++ /dev/null @@ -1,217 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward convolution operation for weights and biases. -/// @details convolution_grad_weights updates weights and bias mutable data for training purposes. -/// @details Please note that this primitive was not heavily tested and currently only batch=1 is enabled for this primitive. -struct convolution_grad_weights - : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(convolution_grad_weights) - - /// @brief Constructs convolution_grad_weights primitive. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param bias List of primitive ids containing bias data. Provide empty vector if using next parameters without bias. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - const std::vector& bias, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - tensor dilation = {1, 1, 1, 1}, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(false), - weights(weights), - bias(bias), - prev_weights_grad(std::vector(0)), - prev_bias_grad(std::vector(0)) {} - - /// @brief Constructs convolution_grad_weights primitive (w/o bias). - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param Should primitive give weights gradient (delta) as an output - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. 
- /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - tensor dilation = {1, 1, 1, 1}, - bool output_grad_w = false, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(output_grad_w), - weights(weights), - bias(std::vector(0)), - prev_weights_grad(std::vector(0)), - prev_bias_grad(std::vector(0)) {} - - /// @brief Constructs convolution_grad_weights primitive (w/o bias). - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - tensor stride, - tensor input_offset, - tensor dilation, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(false), - weights(weights), - bias(std::vector(0)), - prev_weights_grad(std::vector(0)), - prev_bias_grad(std::vector(0)) {} - - /// @brief Constructs convolution_grad_weights primitive with momentum optimizer. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id from convolution forward pass. - /// @param weights List of primitive ids containing weights data. - /// @param bias List of primitive ids containing bias data. Provide empty vector if using next parameters without bias. - /// @param prev_weights_grad List of primitive ids which contains weights gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param prev_bias_grad List of primitive ids which contains bias gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - /// @param dilation Defines dilation size. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param conv_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. 
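The core accumulation behind all of these constructors is the standard convolution weight gradient. A deliberately simplified 1-D, single-channel sketch of that accumulation (the real kernels work on 4-D tensors and additionally handle batching and padding):

    #include <vector>

    // dW[k] = sum_o grad_out[o] * input[o * stride + k * dilation]: the 1-D,
    // unit-batch core of what convolution_grad_weights accumulated.
    std::vector<float> conv1d_weight_grad(const std::vector<float>& input,
                                          const std::vector<float>& grad_out,
                                          size_t kernel, size_t stride = 1, size_t dilation = 1) {
        std::vector<float> dW(kernel, 0.0f);
        for (size_t o = 0; o < grad_out.size(); ++o)
            for (size_t k = 0; k < kernel; ++k) {
                const size_t i = o * stride + k * dilation;
                if (i < input.size())
                    dW[k] += grad_out[o] * input[i];
            }
        return dW;
    }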
- convolution_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const std::vector& weights, - const std::vector& bias, - const std::vector& prev_weights_grad, - const std::vector& prev_bias_grad, - tensor stride = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - tensor dilation = {1, 1, 1, 1}, - const primitive_id& conv_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - conv_grad(conv_grad), - stride(stride), - input_offset(input_offset), - dilation(dilation), - output_grad_w(false), - weights(weights), - bias(bias), - prev_weights_grad(prev_weights_grad), - prev_bias_grad(prev_bias_grad) {} - - /// @brief Primitive id containing convolution gradient data. - primitive_id conv_grad; - /// @brief Defines shift in input buffer between adjacent calculations of output values. - tensor stride; - /// @brief Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution_grad_weights window should start calculations. - tensor input_offset; - /// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. - /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. - /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. - tensor dilation; - /// @brief Should primitive give weights gradient (delta) as an output - bool output_grad_w; - /// @brief List of primitive ids containing weights data. - const primitive_id_arr weights; - /// @brief List of primitive ids containing bias data. - const primitive_id_arr bias; - /// @brief Array of primitive ids containing weights gradient data calculated in previous iteration. - /// Amount of primitives and their memory sizes should be same as weights. - const primitive_id_arr prev_weights_grad; - /// @brief Array of primitive ids containing bias gradient data calculated in previous iteration. - /// Amount of primitives and their memory sizes should be same as biases. - const primitive_id_arr prev_bias_grad; - - /// @brief On how many cards split the computation to. - int32_t split() const { return static_cast(weights.size()); } - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - ret.reserve(weights.size() + bias.size() + !conv_grad.empty() + prev_weights_grad.size() + - prev_bias_grad.size()); - for (auto& w : weights) ret.push_back(std::ref(w)); - for (auto& b : bias) ret.push_back(std::ref(b)); - - for (auto& g : prev_weights_grad) ret.push_back(std::ref(g)); - for (auto& g : prev_bias_grad) ret.push_back(std::ref(g)); - if (!conv_grad.empty()) - ret.push_back(conv_grad); - - return ret; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/deconvolution.hpp b/inference-engine/thirdparty/clDNN/api/deconvolution.hpp index a506a0850fd..141e7b8bbe5 100644 --- a/inference-engine/thirdparty/clDNN/api/deconvolution.hpp +++ b/inference-engine/thirdparty/clDNN/api/deconvolution.hpp @@ -56,8 +56,7 @@ struct deconvolution : public primitive_base { with_output_size(false), groups(1), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive. /// @param id This primitive id. /// @param input Input primitive id. 
@@ -83,8 +82,7 @@ struct deconvolution : public primitive_base { with_output_size(false), groups(groups), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive (w/o bias). /// @param id This primitive id. @@ -100,16 +98,14 @@ struct deconvolution : public primitive_base { const std::vector& weights, tensor stride = {1, 1, 1, 1}, tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding(), - bool gradient = false) + const padding& output_padding = padding()) : primitive_base(id, {input}, output_padding), input_offset(input_offset), stride(stride), with_output_size(false), groups(1), weights(weights), - bias(std::vector(0)), - _gradient(gradient) {} + bias(std::vector(0)) {} /// @brief Constructs deconvolution primitive (w/o bias). /// @param id This primitive id. @@ -127,16 +123,14 @@ struct deconvolution : public primitive_base { uint32_t groups, tensor stride = {1, 1, 1, 1}, tensor input_offset = {0, 0, 0, 0}, - const padding& output_padding = padding(), - bool gradient = false) + const padding& output_padding = padding()) : primitive_base(id, {input}, output_padding), input_offset(input_offset), stride(stride), with_output_size(false), groups(groups), weights(weights), - bias(std::vector(0)), - _gradient(gradient) {} + bias(std::vector(0)) {} /// @brief Constructs deconvolution primitive (computes input paddings to match output size). /// @param id This primitive id. @@ -164,8 +158,7 @@ struct deconvolution : public primitive_base { output_size(output_size), groups(1), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive (computes input paddings to match output size). /// @param id This primitive id. @@ -195,8 +188,7 @@ struct deconvolution : public primitive_base { output_size(output_size), groups(groups), weights(weights), - bias(bias), - _gradient(false) {} + bias(bias) {} /// @brief Constructs deconvolution primitive (w/o bias, computes input paddings to match output size). /// @param id This primitive id. @@ -214,8 +206,7 @@ struct deconvolution : public primitive_base { tensor stride, tensor input_offset, tensor output_size, - const padding& output_padding = padding(), - bool gradient = false) + const padding& output_padding = padding()) : primitive_base(id, {input}, output_padding), input_offset(input_offset), stride(stride), @@ -223,8 +214,7 @@ struct deconvolution : public primitive_base { output_size(output_size), groups(1), weights(weights), - bias(std::vector(0)), - _gradient(gradient) {} + bias(std::vector(0)) {} /// @brief Constructs deconvolution primitive (computes input paddings to match output size). /// @param id This primitive id. @@ -300,12 +290,8 @@ struct deconvolution : public primitive_base { /// @brief On how many cards split the computation to. 
int32_t split() const { return static_cast(weights.size()); } - /// @brief Indicates that deconvolution is used for convolution backward computation (convolution_grad_input) - bool gradient() const { return _gradient; } protected: - bool _gradient; - std::vector> get_dependencies() const override { std::vector> ret; ret.reserve(weights.size() + bias.size()); diff --git a/inference-engine/thirdparty/clDNN/api/eltwise.hpp b/inference-engine/thirdparty/clDNN/api/eltwise.hpp index bff27ac4991..0926b52c430 100644 --- a/inference-engine/thirdparty/clDNN/api/eltwise.hpp +++ b/inference-engine/thirdparty/clDNN/api/eltwise.hpp @@ -92,13 +92,9 @@ struct eltwise : public primitive_base { eltwise_mode mode, const padding& output_padding = padding()) : primitive_base(id, {input, input2}, output_padding), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) {} + stride(std::vector(0)) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -115,13 +111,9 @@ struct eltwise : public primitive_base { eltwise_mode mode, const padding& output_padding = padding()) : primitive_base(id, {input, input2}, output_padding), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(stride), - inputs_calibration_factors(std::vector(0)) {} + stride(stride) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -134,13 +126,9 @@ struct eltwise : public primitive_base { data_types data_type, const padding& output_padding = padding()) : primitive_base(id, inputs, output_padding, optional_data_type{data_type}), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) {} + stride(std::vector(0)) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -151,13 +139,9 @@ struct eltwise : public primitive_base { eltwise_mode mode, const padding& output_padding = padding()) : primitive_base(id, inputs, output_padding), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(std::vector(0)), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) {} + stride(std::vector(0)) {} /// @brief Constructs eltwise primitive. /// @param id This primitive id. @@ -171,13 +155,9 @@ struct eltwise : public primitive_base { data_types data_type, const padding& output_padding = padding()) : primitive_base(id, inputs, output_padding, optional_data_type{data_type}), - output_calibration_factors(""), - output_quantization_factor(1.0f), - input_quantization_factors(0), mode(mode), coefficients(coefficients), - stride(std::vector(0)), - inputs_calibration_factors(std::vector(0)) { + stride(std::vector(0)) { if (mode == eltwise_mode::sum && !coefficients.empty() && coefficients.size() != inputs.size()) { throw std::invalid_argument("Invalid eltwise sum coefficients count (should be equal to 0 or input.size)"); } @@ -186,31 +166,12 @@ struct eltwise : public primitive_base { } } - /// @brief Primitive id containing output quanitization factors per output feature map. 
- primitive_id output_calibration_factors; - /// @brief Output quantization factor - float output_quantization_factor; - /// @brief List of quantization factors per input. - std::vector input_quantization_factors; /// @param mode Eltwise mode. eltwise_mode mode; /// @param coefficients Blob-wise coefficient for SUM operation. std::vector coefficients; /// @brief Defines shift in input buffers between adjacent calculations of output values. std::vector stride; - /// @brief List of primitive ids containing input quantization factors per feature map, one primitive id for each input. - const primitive_id_arr inputs_calibration_factors; - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - if (!output_calibration_factors.empty()) - ret.push_back(output_calibration_factors); - - for (auto& icf : inputs_calibration_factors) ret.push_back(std::ref(icf)); - - return ret; - } }; /// @} /// @} diff --git a/inference-engine/thirdparty/clDNN/api/embed.hpp b/inference-engine/thirdparty/clDNN/api/embed.hpp deleted file mode 100644 index 91a66e32b44..00000000000 --- a/inference-engine/thirdparty/clDNN/api/embed.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief -/// @details Performs embedding upon input. -/// @n\b Example: -/// @n input_size = { 8, 1, 1, 75 }; -/// @n weights_size = {15, 1, 62, 1 }; -/// @n output_size = { 8, 75, 15, 1 }; -/// @par Algorithm: -/// @par Where: -struct embed : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(embed) - - /// @brief Constructs embed primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - /// @param bias Primitive id containing bias data. - embed( - const primitive_id& id, - const primitive_id& input, - const primitive_id& weights, - const primitive_id& bias) - : primitive_base(id, {input}), weights(weights), bias(bias) {} - - /// @brief Constructs embed primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - embed( - const primitive_id& id, - const primitive_id& input, - const primitive_id& weights) - : primitive_base(id, {input}), weights(weights), bias("") {} - - /// @brief Primitive id containing weights data. - primitive_id weights; - /// @brief Primitive id containing bias data. 
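The example sizes in the removed embed docs correspond to a plain embedding lookup plus bias; a standalone sketch under that interpretation (function name and row-major layout are assumptions):

    #include <cstdint>
    #include <vector>

    // With the sizes quoted above: 62 vocabulary rows of dim = 15 values,
    // 8 * 75 input ids, output of shape {8, 75, 15, 1}.
    // out[t][d] = weights[ids[t]][d] + bias[d] for every token position t.
    std::vector<float> embed_lookup(const std::vector<int32_t>& ids,     // batch * seq_len
                                    const std::vector<float>& weights,   // vocab * dim
                                    const std::vector<float>& bias,      // dim
                                    size_t dim) {
        std::vector<float> out(ids.size() * dim);
        for (size_t t = 0; t < ids.size(); ++t)
            for (size_t d = 0; d < dim; ++d)
                out[t * dim + d] = weights[static_cast<size_t>(ids[t]) * dim + d] + bias[d];
        return out;
    }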
- primitive_id bias; - -protected: - std::vector> get_dependencies() const override { - if (bias.empty()) - return {weights}; - else - return {weights, bias}; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn -#pragma once diff --git a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp b/inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp deleted file mode 100644 index 23463cda324..00000000000 --- a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_input.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward fully connected layer (inner product) for input. -struct fully_connected_grad_input : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(fully_connected_grad_input) - - /// @brief Constructs fully connected layer grad for input. - /// @param id This primitive id. - /// @param input_grad Input gradient primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - fully_connected_grad_input( - const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const primitive_id& weights, - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), weights(weights) { - } - - /// @brief Primitive id containing weights data. - primitive_id weights; - -protected: - std::vector> get_dependencies() const override { - return {weights}; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp b/inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp deleted file mode 100644 index 71af7a81691..00000000000 --- a/inference-engine/thirdparty/clDNN/api/fully_connected_grad_weights.hpp +++ /dev/null @@ -1,115 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
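fully_connected_grad_input, removed just above, computed the standard backward-data pass of an inner product: the output gradient multiplied by the untransposed weight matrix (its grad-weights counterpart accumulates outer products of inputs and output gradients in the same fashion). A minimal sketch, with a row-major weight layout assumed:

    #include <vector>

    // grad_in[i] = sum_o grad_out[o] * W[o][i], with W stored row-major as
    // out_features x in_features.
    std::vector<float> fc_grad_input(const std::vector<float>& grad_out,  // out_features
                                     const std::vector<float>& W,         // out_features * in_features
                                     size_t in_features) {
        std::vector<float> grad_in(in_features, 0.0f);
        for (size_t o = 0; o < grad_out.size(); ++o)
            for (size_t i = 0; i < in_features; ++i)
                grad_in[i] += grad_out[o] * W[o * in_features + i];
        return grad_in;
    }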
-*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs backward fully connected layer (inner product) for weights and biases. -struct fully_connected_grad_weights - : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(fully_connected_grad_weights) - - /// @brief Constructs fully connected layer for weights and biases. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - /// @param bias Primitive id containing bias data. Provide empty string if using Relu without bias. - /// @param fc_grad Id of primitive which uses weights and biases updated in this primitive. - /// This is for correct order of calculating. Leave empty if primitive is last in backward pass. - fully_connected_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const primitive_id& weights, - const primitive_id& bias = "", - const primitive_id& fc_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - weights(weights), - bias(bias), - fc_grad(fc_grad), - prev_weights_grad(""), - prev_bias_grad("") {} - - /// @brief Constructs fully connected layer for weights and biases with momentum optimizer. - /// @param id This primitive id. - /// @param input Input gradient primitive id. - /// @param input Input primitive id. - /// @param weights Primitive id containing weights data. - /// @param bias Primitive id containing bias data. Provide empty string if using Relu without bias. - /// @param prev_weights_grad Id of primitive which contains weights gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param prev_bias_grad Id of primitive which contains bias gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param fc_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. - fully_connected_grad_weights(const primitive_id& id, - const primitive_id& input_grad, - const primitive_id& input, - const primitive_id& weights, - const primitive_id& bias, - const primitive_id& prev_weights_grad, - const primitive_id& prev_bias_grad, - const primitive_id& fc_grad = "", - const padding& output_padding = padding()) - : primitive_base(id, {input_grad, input}, output_padding), - weights(weights), - bias(bias), - fc_grad(fc_grad), - prev_weights_grad(prev_weights_grad), - prev_bias_grad(prev_bias_grad) {} - - /// @brief Primitive id containing weights data. - primitive_id weights; - /// @brief Primitive id containing bias data. - primitive_id bias; - /// @brief Primitive id containing fully connected gradient data. - primitive_id fc_grad; - /// @brief Id of primitive containing weights gradient data calculated in previous iteration. It's memory size should be same as weights. - primitive_id prev_weights_grad; - /// @brief Id of primitive containing bias gradient data calculated in previous iteration. It's memory size should be same as biases. 
-    primitive_id prev_bias_grad;
-
-protected:
-    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
-        std::vector<std::reference_wrapper<const primitive_id>> ret;
-        ret.reserve(1 + !bias.empty() + !fc_grad.empty() + !prev_weights_grad.empty() + !prev_bias_grad.empty());
-
-        ret.push_back(weights);
-        if (!bias.empty())
-            ret.push_back(bias);
-
-        if (!prev_weights_grad.empty())
-            ret.push_back(prev_weights_grad);
-        if (!prev_bias_grad.empty())
-            ret.push_back(prev_bias_grad);
-        if (!fc_grad.empty())
-            ret.push_back(fc_grad);
-
-        return ret;
-    }
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api/index_select.hpp b/inference-engine/thirdparty/clDNN/api/index_select.hpp
deleted file mode 100644
index 0e6548eec25..00000000000
--- a/inference-engine/thirdparty/clDNN/api/index_select.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-
-/// @brief Axis which index_select primitive will index.
-enum class index_select_axis_name {
-    along_b,
-    along_f,
-    along_y,
-    along_x
-};
-
-/// @brief Selects indices which will be copied to the output.
-///
-/// @details Applies index selection along the specified dimension. The indices to be copied are specified
-/// by @c indices.
-/// @n
-/// @n Example:
-/// @n input_sizes = (1, 2, 4, 2)
-/// @n input_values = (a, b, c, d)
-/// @n                (e, f, g, h)
-/// @n indices_sizes = (1, 1, 6, 1)
-/// @n indices_values = {0, 0, 1, 1, 3, 3}
-/// @n For axis: along_x:
-/// @n output_sizes = (1, 2, 6, 2)
-/// @n output_values = (a, a, b, b, d, d)
-/// @n                 (e, e, f, f, h, h)
-/// @n
-/// @n The resulting output has sizes equal to the input sizes, except that the size along the selected axis becomes the indices' x size.
-/// @n
-/// @n@b Requirements:
-/// @n - @c input must be a valid primitive_id, whose output format is bfyx/yxfb;
-/// @n - @c indices must be a valid primitive_id, whose output layout is: (bfyx/yxfb, i32, {1, 1, indices_size, 1})
-/// @n - @c axis - a valid index_select_axis_name instance.
-/// @n Breaking any of these conditions will cause an exception to be thrown.
-struct index_select : public primitive_base<index_select> {
-    CLDNN_DECLARE_PRIMITIVE(index_select)
-
-    /// @brief Constructs index_select primitive / layer.
-    ///
-    /// @param id An identifier of the new primitive.
-    /// @param input An identifier of the primitive which is an input for the newly created
-    /// index_select primitive.
-    /// @param indices An identifier of the primitive which holds the indices, laid out in memory along x.
-    /// @param axis Axis of index selection.
-    /// @param output_padding Optional padding for output from primitive.
- index_select( - const primitive_id& id, - const primitive_id& input, - const primitive_id& indices, - index_select_axis_name axis = index_select_axis_name::along_b, - const padding& output_padding = padding()) - : primitive_base(id, {input, indices}, output_padding), axis({axis}), reverse(false) {} - - /// @brief Constructs index_select primitive / layer. - /// - /// @param id An identifier of new primitive. - /// @param input An identifier of primitive, which is an input for newly created - /// index_select primitive. - /// @param axis Axis of index selecting. - /// @param output_padding Optional padding for output from primitive. - index_select( - const primitive_id& id, - const primitive_id& input, - index_select_axis_name axis = index_select_axis_name::along_b, - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), axis({axis}), reverse(true) {} - - /// @brief Constructs index_select primitive / layer. - /// - /// @param id An identifier of new primitive. - /// @param input An identifier of primitive, which is an input for newly created - /// index_select primitive. - /// @param axis Vector of axes of index selecting. - /// @param output_padding Optional padding for output from primitive. - index_select( - const primitive_id& id, - const primitive_id& input, - const std::vector& axis = {index_select_axis_name::along_b}, - const padding& output_padding = padding()) - : primitive_base(id, {input}, output_padding), axis(axis), reverse(true) {} - - /// @brief A list of axes of index selecting - std::vector axis; - /// @brief Do index_select in reverse order on axis/axes. - bool reverse; -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/lookup_table.hpp b/inference-engine/thirdparty/clDNN/api/lookup_table.hpp deleted file mode 100644 index 65349edd55c..00000000000 --- a/inference-engine/thirdparty/clDNN/api/lookup_table.hpp +++ /dev/null @@ -1,58 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Returns values from data on which given indices are pointing at. -struct lookup_table : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(lookup_table) - - /// @brief Enum type to specify axis to maximize/minimize along. - enum axis_name { batch, feature, x, y, xyf }; - - /// @brief Constructs lookup_table primitive. - /// @param id This primitive id. - /// @param input_data Input data primitive id. - /// @param input_indices Input indices primitive id. - /// @param axis Axis to return values from. 
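The along_x example in the index_select docs above is a gather along one axis; a standalone sketch of exactly that case, with a flat row-major layout assumed:

    #include <cstdint>
    #include <vector>

    // For each row, copies the columns named by 'indices':
    // out[row][j] = in[row][indices[j]], so (a, b, c, d) with indices
    // {0, 0, 1, 1, 3, 3} becomes (a, a, b, b, d, d) as in the removed docs.
    std::vector<float> index_select_along_x(const std::vector<float>& in,
                                            size_t rows, size_t x_size,
                                            const std::vector<int32_t>& indices) {
        std::vector<float> out(rows * indices.size());
        for (size_t r = 0; r < rows; ++r)
            for (size_t j = 0; j < indices.size(); ++j)
                out[r * indices.size() + j] = in[r * x_size + static_cast<size_t>(indices[j])];
        return out;
    }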
-    lookup_table(const primitive_id& id,
-                 const primitive_id& input_data,
-                 const primitive_id& input_indices,
-                 axis_name axis = axis_name::xyf,
-                 const padding& output_padding = padding())
-        : primitive_base(id, {input_data, input_indices}, output_padding),
-          axis(axis),
-          with_axis(axis == axis_name::xyf ? false : true) {}
-
-    /// @brief Axis to return values from. If not set, returns the data element each index points at within the flattened x, y, f dimensions of its batch.
-    axis_name axis;
-    /// @brief Indicates that the primitive has a user-defined axis to return values from.
-    bool with_axis;
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api/network.hpp b/inference-engine/thirdparty/clDNN/api/network.hpp
index ffb9645840e..6c83688120d 100644
--- a/inference-engine/thirdparty/clDNN/api/network.hpp
+++ b/inference-engine/thirdparty/clDNN/api/network.hpp
@@ -113,12 +113,6 @@ struct network {
     /// @brief Provides user-supplied @ref memory for output primitives defined by user in source @ref topology.
     void set_output_memory(const primitive_id& id, const memory& mem) const;
 
-    /// @brief Sets learning rate for training primitives.
-    void set_learning_rate(const float lr);
-
-    /// @brief Returns learning rate.
-    float get_learning_rate();
-
     /// @brief Returns stream id.
     uint16_t get_stream_id();
 
diff --git a/inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp b/inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp
deleted file mode 100644
index 667cf5b3d2f..00000000000
--- a/inference-engine/thirdparty/clDNN/api/scale_grad_input.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Performs the backward pass of the scale primitive for its input.
-struct scale_grad_input : public primitive_base<scale_grad_input> {
-    CLDNN_DECLARE_PRIMITIVE(scale_grad_input)
-
-    /// @brief Constructs scale_grad_input.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param scale_input Scale input primitive id with values needed for product computation.
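scale_grad_input is the data-gradient half of the scale layer: the incoming gradient is multiplied elementwise by the broadcast scale values. A minimal per-feature sketch, with a flat layout assumed:

    #include <vector>

    // grad_in[f][s] = grad_out[f][s] * scale[f]: the backward-for-input pass
    // of a per-feature scale layer, broadcasting scale over spatial positions.
    void scale_grad_input_ref(const std::vector<float>& grad_out, std::vector<float>& grad_in,
                              const std::vector<float>& scale, size_t features, size_t spatial) {
        for (size_t f = 0; f < features; ++f)
            for (size_t s = 0; s < spatial; ++s)
                grad_in[f * spatial + s] = grad_out[f * spatial + s] * scale[f];
    }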
- scale_grad_input(const primitive_id& id, - const primitive_id& input, - const primitive_id& scale_input, // should be bfyx or yxfb, where each dimension can be 1, if all - // dimensions are 1 then this is scalar - const padding& output_padding = padding()) - : primitive_base(id, {input, scale_input}, output_padding) {} - -protected: - std::vector> get_dependencies() const override { return {}; } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp b/inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp deleted file mode 100644 index d13b18d137a..00000000000 --- a/inference-engine/thirdparty/clDNN/api/scale_grad_weights.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/* -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////// -#pragma once -#include "primitive.hpp" -#include - -namespace cldnn { -/// @addtogroup cpp_api C++ API -/// @{ -/// @addtogroup cpp_topology Network Topology -/// @{ -/// @addtogroup cpp_primitives Primitives -/// @{ - -/// @brief Performs scale layer backward for scale_input and biases. -struct scale_grad_weights : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(scale_grad_weights) - - /// @brief Constructs scale_grad_weights primitive without bias. - /// @param id This primitive id. - /// @param input Input primitive id. Same as input for scale forward. - /// @param input_grad Input gradient primitive id. - /// @param scale_input Scale input primitive id. - /// @param scale_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. - scale_grad_weights(const primitive_id& id, - const primitive_id& input, - const primitive_id& input_grad, - const primitive_id& scale_input, // should be one number per feature - const primitive_id& scale_grad = "", // leave empty if this is last primitive in backward pass - const padding& output_padding = padding()) - : primitive_base(id, {input, input_grad}, output_padding), - scale_input(scale_input), - bias(""), - prev_scale_grad(""), - prev_bias_grad(""), - scale_grad(scale_grad) {} - - /// @brief Constructs scale_grad_weights primitive with optional adding bias. - /// @param id This primitive id. - /// @param input Input primitive id. Same as input for scale forward. - /// @param input_grad Input gradient primitive id. - /// @param scale_input Scale input primitive id. - /// @param bias Primitive id containing bias data. - /// @param scale_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. 
- scale_grad_weights(const primitive_id& id, - const primitive_id& input, - const primitive_id& input_grad, - const primitive_id& scale_input, // should be one number per feature - const primitive_id& bias, // should be same size as scale_input - const primitive_id& scale_grad = "", // leave empty if this is last primitive in backward pass - const padding& output_padding = padding()) - : primitive_base(id, {input, input_grad}, output_padding), - scale_input(scale_input), - bias(bias), - prev_scale_grad(""), - prev_bias_grad(""), - scale_grad(scale_grad) {} - - /// @brief Constructs scale_grad_weights primitive with optional bias and momentum optimizer. - /// @param id This primitive id. - /// @param input Input primitive id. Same as input for scale forward. - /// @param input_grad Input gradient primitive id. - /// @param scale_input Scale input primitive id. - /// @param bias Primitive id containing bias data. - /// @param prev_scale_grad Id of primitive which contains scale gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param prev_bias_grad Id of primitive which contains bias gradient data calculated in previous iteration. Used in momentum optimizer. - /// @param scale_grad Id of primitive which uses weights and biases updated in this primitive. This is for correct order of calculating. - scale_grad_weights(const primitive_id& id, - const primitive_id& input, - const primitive_id& input_grad, - const primitive_id& scale_input, // should be one number per feature - const primitive_id& bias, // should be same size as scale_input - const primitive_id& prev_scale_grad, - const primitive_id& prev_bias_grad, // leave empty if bias not specified - const primitive_id& scale_grad = "", // leave empty if this is last primitive in backward pass - const padding& output_padding = padding()) - : primitive_base(id, {input, input_grad}, output_padding), - scale_input(scale_input), - bias(bias), - prev_scale_grad(prev_scale_grad), - prev_bias_grad(prev_bias_grad), - scale_grad(scale_grad) {} - - /// @brief Scale input primitive id. - primitive_id scale_input; - /// @brief Primitive id containing bias data. - primitive_id bias; - /// @brief Primitive id containing scale gradient data calculated in previous iteration. - primitive_id prev_scale_grad; - /// @brief Primitive id containing bias gradient data calculated in previous iteration. - primitive_id prev_bias_grad; - /// @brief Primitive id which uses weights and biases updated in this primitive. - primitive_id scale_grad; - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - ret.reserve(1 + !bias.empty() + !prev_scale_grad.empty() + !prev_bias_grad.empty()); - - ret.push_back(scale_input); - if (!bias.empty()) - ret.push_back(bias); - if (!prev_scale_grad.empty()) - ret.push_back(prev_scale_grad); - if (!prev_bias_grad.empty()) - ret.push_back(prev_bias_grad); - if (!scale_grad.empty()) - ret.push_back(scale_grad); - - return ret; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp b/inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp deleted file mode 100644 index e436f5b0baa..00000000000 --- a/inference-engine/thirdparty/clDNN/api/softmax_loss_grad.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
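The weight-side counterpart, scale_grad_weights, reduces input * gradient (and, when a bias is present, the plain gradient) over everything except the feature axis, yielding one number per feature as the constructor comments note. A standalone sketch under that reading:

    #include <vector>

    // d_scale[f] = sum_s input[f][s] * grad[f][s]; d_bias[f] = sum_s grad[f][s].
    // One value per feature, matching the "should be one number per feature"
    // comments on scale_input in the removed constructors.
    void scale_grad_weights_ref(const std::vector<float>& input, const std::vector<float>& grad,
                                std::vector<float>& d_scale, std::vector<float>& d_bias,
                                size_t features, size_t spatial) {
        for (size_t f = 0; f < features; ++f) {
            d_scale[f] = 0.0f;
            d_bias[f] = 0.0f;
            for (size_t s = 0; s < spatial; ++s) {
                d_scale[f] += input[f * spatial + s] * grad[f * spatial + s];
                d_bias[f] += grad[f * spatial + s];
            }
        }
    }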
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "primitive.hpp"
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Backward pass for Softmax log loss.
-/// @details The output values are the same as input_prob, except for the element at the position given by the label, which has 1 subtracted from it.
-struct softmax_loss_grad : public primitive_base<softmax_loss_grad> {
-    CLDNN_DECLARE_PRIMITIVE(softmax_loss_grad)
-
-    /// @brief Constructs softmax_loss_grad primitive.
-    /// @param id This primitive id.
-    /// @param input_prob Input primitive id.
-    /// @param labels Labels primitive id.
-    softmax_loss_grad(const primitive_id& id,
-                      const primitive_id& input_prob,
-                      const primitive_id& labels,
-                      const padding& output_padding = padding())
-        : primitive_base(id, {input_prob, labels}, output_padding) {}
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
diff --git a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp b/inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp
deleted file mode 100644
index cf27c417e42..00000000000
--- a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_bn_scale.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#pragma once
-#include "api/primitive.hpp"
-#include <vector>
-
-namespace cldnn {
-/// @addtogroup cpp_api C++ API
-/// @{
-/// @addtogroup cpp_topology Network Topology
-/// @{
-/// @addtogroup cpp_primitives Primitives
-/// @{
-
-/// @brief Primitive that fuses convolution, batch norm, scale and, optionally, Relu.
-struct fused_conv_bn_scale : public primitive_base<fused_conv_bn_scale> {
-    CLDNN_DECLARE_PRIMITIVE(fused_conv_bn_scale)
-
-    /// @brief Constructs convolution primitive fused with batch norm and scale.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param weights List of primitive ids containing weights data.
-    /// @param bias List of primitive ids containing bias data.
-    /// @param epsilon Small number to protect against division by zero.
-    /// @param scale_input Scale input primitive id with values needed for product computation. Used in the fused scale part.
-    /// @param scale_bias Primitive id containing bias data for the fused scale part.
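The rule in the softmax_loss_grad docs just removed (output equals input_prob, with 1 subtracted at the labelled position) is the classic softmax cross-entropy gradient; a standalone sketch:

    #include <cstdint>
    #include <vector>

    // grad[b][c] = prob[b][c] - (c == label[b] ? 1 : 0): copies the
    // probabilities and subtracts 1 at the labelled class, per batch item.
    std::vector<float> softmax_loss_grad_ref(const std::vector<float>& prob,    // batch * classes
                                             const std::vector<int32_t>& label, // batch
                                             size_t classes) {
        std::vector<float> grad = prob;
        for (size_t b = 0; b < label.size(); ++b)
            grad[b * classes + static_cast<size_t>(label[b])] -= 1.0f;
        return grad;
    }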
- /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, - /// where (0,0) point of the convolution window should start calculations. - /// @param stride Defines shift in input buffer between adjacent calculations of output values. - /// @param inv_variance Primitive id containing inverted variance calculated in this primitive. Used in fused batch norm part. - /// @param with_activation Enable Relu activation. - /// @param activation_slp Relu activation slope. - fused_conv_bn_scale(const primitive_id& id, - const primitive_id& input, - const std::vector& weights, - const std::vector& bias, - float epsilon, - const primitive_id& scale_input, - const primitive_id& scale_bias = "", - tensor stride = {1, 1, 1, 1}, - tensor dilation = {1, 1, 1, 1}, - tensor input_offset = {0, 0, 0, 0}, - const primitive_id& inv_variance = "", - const padding& output_padding = padding()) - : primitive_base(id, {input, scale_input}, output_padding), - input_offset(input_offset), - stride(stride), - dilation(dilation), - with_output_size(false), - scale_bias(scale_bias), - inv_variance(inv_variance), - epsilon(epsilon), - weights(weights), - bias(bias) { - if ((bias.size() != 0) && (weights.size() != bias.size())) - throw std::runtime_error("convolution's weights/bias count does not match"); - } - - /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. - tensor input_offset; - /// @brief Defines shift in input buffer between adjacent calculations of output values. - tensor stride; - /// @brief Defines gaps in the input - dilation rate k=1 is normal convolution, k=2 means skipping one pixel per input, k=4 means skipping 3 pixels. - /// As an example in one dimension, a filter w of size 3 would compute over input x the following: w[0]*x[0] + w[1]*x[1] + w[2]*x[2] for dilation of 1. - /// For dilation 2 the filter would instead compute w[0]*x[0] + w[1]*x[2] + w[2]*x[4]. - tensor dilation; - /// @brief Indicates that the primitive has user-defined output size (non-zero value). - bool with_output_size; - /// @brief User-defined output data size of the primitive (w/o padding). - tensor output_size; - /// @brief Primitive id containing scale bias data for fused convolution. - primitive_id scale_bias; - /// @brief Primitive id containing inverted variance used in future gradient computing for fused convolution. - primitive_id inv_variance; - /// @brief Epsilon for fused convolution. - float epsilon; - /// @brief On how many cards split the computation to. - int32_t split() const { return static_cast(weights.size()); } - /// @brief List of primitive ids containing weights data. - const primitive_id_arr weights; - /// @brief List of primitive ids containing bias data. 
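The dilation comment above already works one dimension by hand (for dilation 2, w[0]*x[0] + w[1]*x[2] + w[2]*x[4]). The same arithmetic as a runnable 1-D sketch, valid output positions only — an illustration, not clDNN code:

    #include <cstddef>
    #include <vector>

    // 1-D dilated convolution: output i reads x[i + k*dilation] for each tap k.
    std::vector<float> conv1d_dilated(const std::vector<float>& x,
                                      const std::vector<float>& w,
                                      std::size_t dilation) {
        const std::size_t span = (w.size() - 1) * dilation + 1;  // receptive field
        std::vector<float> y;
        for (std::size_t i = 0; i + span <= x.size(); ++i) {
            float acc = 0.0f;
            for (std::size_t k = 0; k < w.size(); ++k)
                acc += w[k] * x[i + k * dilation];  // dilation == 1 is ordinary conv
            y.push_back(acc);
        }
        return y;
    }

With w of size 3 and dilation 2 this reproduces the comment's expansion exactly.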
- const primitive_id_arr bias; - -protected: - std::vector> get_dependencies() const override { - std::vector> ret; - ret.reserve(weights.size() + bias.size() + !scale_bias.empty() + !inv_variance.empty()); - for (auto& w : weights) ret.push_back(std::ref(w)); - for (auto& b : bias) ret.push_back(std::ref(b)); - if (!scale_bias.empty()) - ret.push_back(scale_bias); - if (!inv_variance.empty()) - ret.push_back(inv_variance); - return ret; - } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp b/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp index f7528c4e884..deb9a3ec73d 100644 --- a/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp +++ b/inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp @@ -37,9 +37,6 @@ struct fused_conv_eltwise : public primitive_base { /// @param input Input primitive id. /// @param weights List of primitive ids containing weights data. /// @param bias List of primitive ids containing bias data. - /// @param w_quantization_factor List of primitive ids containing weights quanitization factors per output feature map. - /// @param output_calibration_factors List of primitive ids output containing calibration factors per output feature map. - /// @param i_quantization_factor Input quantization factor /// @param input_offset Defines a shift, relative to (0,0) position of the input buffer, /// where (0,0) point of the convolution window should start calculations. /// @param stride Defines shift in input buffer between adjacent calculations of output values. @@ -57,11 +54,6 @@ struct fused_conv_eltwise : public primitive_base { eltwise_mode mode, const std::vector& weights, const std::vector& bias, - const std::vector& conv_w_quantization_factor, - const std::vector& conv_output_calibration_factors, - const float conv_i_quantization_factor, - const float non_conv_scale, - const primitive_id& eltw_output_calibration_factors, const std::vector& eltw_stride, tensor stride = {1, 1, 1, 1}, tensor input_offset = {0, 0, 0, 0}, @@ -74,18 +66,10 @@ struct fused_conv_eltwise : public primitive_base { optional_data_type output_data_type = {}) : primitive_base(id, {input, input2}, output_padding, output_data_type), conv((primitive_id_arr)weights, - (primitive_id_arr)bias, - (primitive_id_arr)conv_w_quantization_factor, - (primitive_id_arr)conv_output_calibration_factors), - eltw(eltw_output_calibration_factors), - non_conv_scale(non_conv_scale), + (primitive_id_arr)bias), + eltw(), conv_weights(weights), - conv_bias(bias), - conv_weights_quantization_factors(conv_w_quantization_factor), - conv_output_calibration_factors(conv_output_calibration_factors) { - conv.input_quantization_factor = conv_i_quantization_factor; - conv.output_quantization_factor = 1.0f; - + conv_bias(bias) { conv.input_offset = input_offset; conv.stride = stride; conv.dilation = dilation; @@ -100,10 +84,6 @@ struct fused_conv_eltwise : public primitive_base { if ((bias.size() != 0) && (weights.size() != bias.size())) throw std::runtime_error("convolution's weights/bias count does not match"); - if (conv.output_calibration_factors.size()) { - if ((weights.size() != 0) && (weights.size() != conv.weights_quantization_factors.size())) - throw std::runtime_error("convolution's weights count does not match quantization factors count"); - } } struct conv_data { @@ -111,14 +91,6 @@ struct fused_conv_eltwise : public primitive_base { const primitive_id_arr weights; /// @brief List 
of primitive ids containing bias data. const primitive_id_arr bias; - /// @brief List of primitive ids containing weights quanitization factors per output feature map. - const primitive_id_arr weights_quantization_factors; - /// @brief List of primitive ids containing output quanitization factors per output feature map for convolution. - const primitive_id_arr output_calibration_factors; - /// @brief Input quantization factor for convolution - float input_quantization_factor; - /// @brief Output quantization factor for convolution - float output_quantization_factor; /// @brief Defines a shift, relative to (0,0) position of the input buffer, where (0,0) point of the convolution window should start calculations. tensor input_offset; /// @brief Defines shift in input buffer between adjacent calculations of output values. @@ -137,20 +109,12 @@ struct fused_conv_eltwise : public primitive_base { tensor output_size; conv_data(const primitive_id_arr& weights, - const primitive_id_arr& bias, - const primitive_id_arr& weights_quantization_factors, - const primitive_id_arr& output_calibration_factors) + const primitive_id_arr& bias) : weights(weights), - bias(bias), - weights_quantization_factors(weights_quantization_factors), - output_calibration_factors(output_calibration_factors) {} + bias(bias) {} } conv; struct eltw_data { - /// @brief Primitive id containing output quanitization factors per output feature map. - primitive_id output_calibration_factors; - /// @brief Output quantization factor for eltwise - float output_quantization_factor; /// @param mode Eltwise mode. eltwise_mode mode; /// @brief Enable Relu activation. @@ -159,22 +123,11 @@ struct fused_conv_eltwise : public primitive_base { float activation_negative_slope; /// @brief Defines shift in input buffers between adjacent calculations of output values. std::vector stride; - explicit eltw_data(const primitive_id& output_calibration_factors) - : output_calibration_factors(output_calibration_factors) {} } eltw; /// @brief On how many cards split the computation to. int32_t split() const { return static_cast(conv.weights.size()); } - // FIXME: In fact, that should be needed for any EltWise primitive, not - // only the fused one. What's more important, these scales should be - // separate for different inputs and probably per-channel, not per - // primitive. - // - // I'm only needing a scalar for my particular task, so let's hack like - // this in the meantime. The final design is still to be investigated. - float non_conv_scale = 1.0f; - /// @brief Is optimization that output contains data from second input ON ? bool second_input_in_output = false; bool depth_to_space_already_fused = false; @@ -182,21 +135,13 @@ struct fused_conv_eltwise : public primitive_base { protected: const primitive_id_arr conv_weights; const primitive_id_arr conv_bias; - const primitive_id_arr conv_weights_quantization_factors; - const primitive_id_arr conv_output_calibration_factors; std::vector> get_dependencies() const override { std::vector> ret; - ret.reserve(conv.weights.size() + conv.bias.size() + conv.weights_quantization_factors.size() + - conv.output_calibration_factors.size() + (eltw.output_calibration_factors.empty() ? 
0 : 1)); + ret.reserve(conv.weights.size() + conv.bias.size()); for (auto& w : conv.weights) ret.push_back(std::ref(w)); for (auto& b : conv.bias) ret.push_back(std::ref(b)); - for (auto& q : conv.weights_quantization_factors) ret.push_back(std::ref(q)); - for (auto& q : conv.output_calibration_factors) ret.push_back(std::ref(q)); - - if (!eltw.output_calibration_factors.empty()) - ret.push_back(eltw.output_calibration_factors); return ret; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h index 5865ca81734..d159874ee0a 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_tools.h @@ -67,7 +67,6 @@ inline uint8_t GetActivationAdditionalParamsNumber(ActivationFunction func) { break; case ActivationFunction::RELU_NEGATIVE_SLOPE: case ActivationFunction::ELU: - case ActivationFunction::RELU_NEGATIVE_SLOPE_GRAD: paramsNum = 1; break; default: diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h index 58772662a89..c8e39446e3e 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h @@ -25,8 +25,6 @@ enum class KernelType { UNKNOWN, ARG_MAX_MIN, AVERAGE_UNPOOLING, - BATCH_NORM_GRAD, - LOOKUP_TABLE, CONVOLUTION, DECONVOLUTION, LRN, @@ -38,9 +36,7 @@ enum class KernelType { SOFT_MAX, ELTWISE, SCALE, - FUSED_CONV_BN_SCALE, FUSED_CONV_ELTWISE, - TABLE_LOOKUP, REORDER, RESHAPE, PERMUTE, @@ -49,21 +45,14 @@ enum class KernelType { REGION_YOLO, REORG_YOLO, MAX_UNPOOLING, - CONVOLUTION_GRAD_WEIGHTS, - SCALE_GRAD_WEIGHTS, MVN, - FULLY_CONNECTED_GRAD_INPUT, - FULLY_CONNECTED_GRAD_WEIGHTS, LSTM_GEMM, LSTM_ELT, - EMBED, - SOFT_MAX_LOSS_GRAD, BORDER, TILE, SELECT, BROADCAST, GEMM, - INDEX_SELECT, PYRAMID_ROI_ALIGN, CONTRACT, ONE_HOT, @@ -133,8 +122,6 @@ enum class ActivationFunction { SQRT, LINEAR, ELU, - RELU_GRAD, - RELU_NEGATIVE_SLOPE_GRAD, SIN, ASIN, SINH, @@ -155,7 +142,6 @@ enum class ActivationFunction { NEGATIVE, NOT, POW, - NONE_GRAD, ERF, HARD_SIGMOID, RECIPROCAL, diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp index e85a6e0eaaf..31b20418efb 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_base.cpp @@ -103,9 +103,6 @@ KernelsData ActivationKernelBase::GetCommonKernelsData(const Params& params, con FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 1, GetFusedPrimitiveInputsCount(params)); - if (newParams.gradient) - kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); - if (!newParams.inputActivationParams.empty()) { kernel.arguments.push_back({ArgumentDescriptor::Types::SLOPE, 0}); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp index 0a14ff8c472..cbe17079757 100644 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_opt.cpp @@ -34,7 +34,6 @@ ParamsKey ActivationKernelOpt::GetSupportedKey() const { k.EnableAllOutputLayout(); k.EnableTensorOffset(); k.EnableBatching(); - k.EnableGradient(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp index cc3231946a7..89f019c7af6 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/activation/activation_kernel_ref.cpp @@ -38,7 +38,6 @@ ParamsKey ActivationKernelRef::GetSupportedKey() const { k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); - k.EnableGradient(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp deleted file mode 100644 index ca181884b13..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "batch_norm_kernel_base.h" -#include - -namespace kernel_selector { -bool BatchNormKernelBase::Validate(const Params& p, const optional_params& o) const { - if (p.GetType() != KernelType::BATCH_NORM_GRAD || o.GetType() != KernelType::BATCH_NORM_GRAD) { - return false; - } - - return true; -} - -JitConstants BatchNormKernelBase::GetJitConstants(const batch_norm_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - jit.AddConstant(MakeJitConstant("EPSILON", params.batchNormParams.epsilon)); - if (params.batchNormParams.with_inv_var) - jit.AddConstant(MakeJitConstant("FORWARD", 1)); - if (params.batchNormParams.with_scale_shift) - jit.AddConstant(MakeJitConstant("SCALE_SHIFT", 1)); - if (params.batchNormParams.with_mean_var_out) - jit.AddConstant(MakeJitConstant("MEAN_VAR_OUT", 1)); - - return jit; -} - -BatchNormKernelBase::DispatchData BatchNormKernelBase::SetDefault(const batch_norm_params& params) const { - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - kd.gws0 = params.inputs[0].Batch().v; - kd.gws1 = params.inputs[0].Feature().v; - kd.gws2 = 1; - - kd.lws0 = std::min(std::max(kd.gws0, static_cast(1)), static_cast(256)); - while (kd.gws0 % kd.lws0 != 0) { - --kd.lws0; - } - kd.lws1 = 1; - kd.lws2 = 1; - - return kd; -} - -KernelsData BatchNormKernelBase::GetCommonKernelsData(const Params& params, - const optional_params& options, - float estimatedTime) const { - if (!Validate(params, options)) { - return {}; - } - - const batch_norm_params& orgParams = static_cast(params); - - DispatchData runInfo = SetDefault(orgParams); - - KernelData kd = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - int inputs_num = 1 + orgParams.batchNormParams.with_inv_var + 2 * orgParams.batchNormParams.with_scale_shift + - 2 * orgParams.batchNormParams.with_mean_var_out; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, inputs_num); - - kd.estimatedTime = estimatedTime; - - return {kd}; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h deleted file mode 100644 index 91344f2786a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_base.h +++ /dev/null @@ -1,66 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
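A detail in the deleted SetDefault above that is easy to read past: the decrement loop picks lws0 as the largest divisor of the global batch dimension that does not exceed 256, so the local size always divides the global size as OpenCL requires. Isolated as a sketch (hypothetical helper name):

    #include <algorithm>
    #include <cstddef>

    // Largest divisor of gws that is <= cap; mirrors the decrement loop above.
    std::size_t pick_local_size(std::size_t gws, std::size_t cap = 256) {
        std::size_t lws = std::min(std::max<std::size_t>(gws, 1), cap);
        while (gws % lws != 0)
            --lws;      // always terminates: 1 divides everything
        return lws;     // e.g. gws=96 -> 96, gws=300 -> 150, gws=257 -> 1
    }

The linear downward search is fine here because it runs at most 256 iterations, once at kernel-selection time rather than per dispatch.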
-*/ - -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_params : public base_params { - batch_norm_params() : base_params(KernelType::BATCH_NORM_GRAD) {} - - struct DedicatedParams { - float epsilon; - bool with_inv_var; - bool with_scale_shift; - bool with_mean_var_out = false; - }; - - DedicatedParams batchNormParams; - - virtual ParamsKey GetParamsKey() const { - return base_params::GetParamsKey(); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_optional_params : optional_params { - batch_norm_optional_params() : optional_params(KernelType::BATCH_NORM_GRAD) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// BatchNormKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class BatchNormKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - virtual ~BatchNormKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const; - virtual JitConstants GetJitConstants(const batch_norm_params& params) const; - virtual DispatchData SetDefault(const batch_norm_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp deleted file mode 100644 index e839de45283..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
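batch_norm_params above was the host-side source of the kernel's JIT configuration: epsilon became the EPSILON define and each boolean flag conditionally added FORWARD, SCALE_SHIFT, or MEAN_VAR_OUT (see GetJitConstants in the deleted .cpp above). The same mapping written as a plain -D option string, the form such defines ultimately take when the OpenCL program is built — a hypothetical helper, not the kernel_selector JitConstants API:

    #include <sstream>
    #include <string>

    struct BatchNormFlags {  // mirrors batch_norm_params::DedicatedParams above
        float epsilon;
        bool with_inv_var, with_scale_shift, with_mean_var_out;
    };

    std::string build_options(const BatchNormFlags& p) {
        std::ostringstream os;
        os << "-DEPSILON=" << p.epsilon << "f";
        if (p.with_inv_var)      os << " -DFORWARD=1";
        if (p.with_scale_shift)  os << " -DSCALE_SHIFT=1";
        if (p.with_mean_var_out) os << " -DMEAN_VAR_OUT=1";
        return os.str();
    }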
-*/ - -#include "batch_norm_kernel_ref.h" - -namespace kernel_selector { -ParamsKey BatchNormKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableBatching(); - return k; -} - -KernelsData BatchNormKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h deleted file mode 100644 index 117b068446c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_ref.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "batch_norm_kernel_base.h" - -namespace kernel_selector { -class BatchNormKernelRef : public BatchNormKernelBase { -public: - BatchNormKernelRef() : BatchNormKernelBase("batch_norm_gpu_ref") {} - virtual ~BatchNormKernelRef() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp deleted file mode 100644 index 5d48a80933d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
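GetSupportedKey above is the declarative half of kernel selection: each kernel enumerates the data types and layouts it can handle, and the selector intersects that key with the request before the kernel is ever considered. Reduced to a sketch with stand-in enums (not the real ParamsKey type):

    #include <set>

    enum class DType  { F16, F32, INT8 };
    enum class Layout { bfyx, byxf, yxfb };

    struct SupportKey {
        std::set<DType>  in_types,   out_types;
        std::set<Layout> in_layouts, out_layouts;
        // A request is viable only if every facet is individually supported.
        bool supports(DType it, DType ot, Layout il, Layout ol) const {
            return in_types.count(it)   && out_types.count(ot) &&
                   in_layouts.count(il) && out_layouts.count(ol);
        }
    };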
-*/ - -#include "batch_norm_kernel_selector.h" -#include "batch_norm_kernel_ref.h" - -namespace kernel_selector { - -batch_norm_kernel_selector::batch_norm_kernel_selector() { - Attach(); -} - -KernelsData batch_norm_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::BATCH_NORM_GRAD); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h deleted file mode 100644 index 25915b6bb2c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm/batch_norm_kernel_selector.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class batch_norm_kernel_selector : public kernel_selector_base { -public: - static batch_norm_kernel_selector& Instance() { - static batch_norm_kernel_selector instance_; - return instance_; - } - - batch_norm_kernel_selector(); - - virtual ~batch_norm_kernel_selector() {} - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp deleted file mode 100644 index b5f679179a7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
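The deleted selector above follows the registration pattern used throughout kernel_selector: a Meyers singleton whose constructor Attach()es every candidate implementation, with GetNaiveBestKernel then filtering by KernelType and ranking the survivors. The shape of that pattern as a sketch — stand-in types, and first-supported-wins in place of the real ranking:

    #include <memory>
    #include <vector>

    struct Params { /* kernel type, tensor shapes, ... in the real code */ };

    struct KernelImpl {
        virtual ~KernelImpl() = default;
        virtual bool Supports(const Params&) const = 0;
    };

    class ExampleSelector {
    public:
        static ExampleSelector& Instance() {   // same singleton shape as above
            static ExampleSelector instance;
            return instance;
        }
        const KernelImpl* Pick(const Params& p) const {
            for (const auto& k : kernels_)     // simplified: first match wins
                if (k->Supports(p)) return k.get();
            return nullptr;
        }
    private:
        ExampleSelector() { /* Attach<SomeKernelRef>() calls go here */ }
        std::vector<std::unique_ptr<KernelImpl>> kernels_;
    };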
-*/ - -#include "batch_norm_grad_kernel_base.h" - -namespace kernel_selector { -bool BatchNormGradKernelBase::Validate(const Params& p, const optional_params& o) const { - if (p.GetType() != KernelType::BATCH_NORM_GRAD || - o.GetType() != KernelType::BATCH_NORM_GRAD) { - return false; - } - - return true; -} - -JitConstants BatchNormGradKernelBase::GetJitConstants(const batch_norm_grad_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - return jit; -} - -BatchNormGradKernelBase::DispatchData BatchNormGradKernelBase::SetDefault(const batch_norm_grad_params& params) const { - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - kd.gws0 = params.inputs[0].Batch().v; - kd.gws1 = params.inputs[0].Feature().v; - kd.gws2 = 1; - - kd.lws0 = params.inputs[0].Batch().v; - kd.lws1 = 1; - kd.lws2 = 1; - - return kd; -} - -KernelsData BatchNormGradKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const { - if (!Validate(params, options)) { - return {}; - } - - const batch_norm_grad_params& orgParams = static_cast(params); - - DispatchData runInfo = SetDefault(orgParams); - - KernelData kd = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 3); - - kd.estimatedTime = estimatedTime; - - return {kd}; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h deleted file mode 100644 index f89a6ec3cf8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_base.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_grad_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_grad_params : public base_params { - batch_norm_grad_params() : base_params(KernelType::BATCH_NORM_GRAD) {} - - virtual ParamsKey GetParamsKey() const { - return base_params::GetParamsKey(); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// batch_norm_grad_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct batch_norm_grad_optional_params : optional_params { - batch_norm_grad_optional_params() : optional_params(KernelType::BATCH_NORM_GRAD) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// BatchNormGradKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class BatchNormGradKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - virtual ~BatchNormGradKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const; - virtual JitConstants GetJitConstants(const batch_norm_grad_params& params) const; - virtual DispatchData SetDefault(const batch_norm_grad_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp deleted file mode 100644 index c775d379fa9..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "batch_norm_grad_kernel_ref.h" - -namespace kernel_selector { -ParamsKey BatchNormGradKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableBatching(); - return k; -} - -KernelsData BatchNormGradKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h deleted file mode 100644 index f24fbc2ca98..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_ref.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "batch_norm_grad_kernel_base.h" - -namespace kernel_selector { -class BatchNormGradKernelRef : public BatchNormGradKernelBase { -public: - BatchNormGradKernelRef() : BatchNormGradKernelBase("batch_norm_grad_gpu_ref") {} - virtual ~BatchNormGradKernelRef() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp deleted file mode 100644 index 6891bd11ed3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "batch_norm_grad_kernel_selector.h" -#include "batch_norm_grad_kernel_ref.h" - -namespace kernel_selector { - -batch_norm_grad_kernel_selector::batch_norm_grad_kernel_selector() { - Attach(); -} - -KernelsData batch_norm_grad_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::BATCH_NORM_GRAD); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h deleted file mode 100644 index 9a20745f9fb..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/batch_norm_grad/batch_norm_grad_kernel_selector.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class batch_norm_grad_kernel_selector : public kernel_selector_base { -public: - static batch_norm_grad_kernel_selector& Instance() { - static batch_norm_grad_kernel_selector instance_; - return instance_; - } - - batch_norm_grad_kernel_selector(); - - virtual ~batch_norm_grad_kernel_selector() {} - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp deleted file mode 100644 index 111971e0a6d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "contract_kernel_base.h" -#include -#include "kernel_selector_utils.h" - -namespace kernel_selector { -JitConstants ContractKernelBase::GetJitConstants(const contract_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - const size_t no_dim_flag = 6; - std::vector output_dims(4, no_dim_flag); - int out_dim = 2; - for (int i = 3; i >= 0; --i) { - if (std::find(params.reduction_axes.begin(), params.reduction_axes.end(), i) == params.reduction_axes.end()) - output_dims.at(i) = out_dim--; - } - - if (output_dims[3] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_X", output_dims.at(3))}); - if (output_dims[2] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_Y", output_dims.at(2))}); - if (output_dims[1] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_F", output_dims.at(1))}); - if (output_dims[0] != no_dim_flag) - jit.AddConstants({MakeJitConstant("DIM_B", output_dims.at(0))}); - - jit.AddConstants({MakeJitConstant("REDUCE_X", output_dims.at(3) == no_dim_flag), - MakeJitConstant("REDUCE_Y", output_dims.at(2) == no_dim_flag), - MakeJitConstant("REDUCE_F", output_dims.at(1) == no_dim_flag), - MakeJitConstant("REDUCE_B", output_dims.at(0) == no_dim_flag)}); - - switch (params.mode) { - case ContractMode::SUM: - jit.AddConstants({MakeJitConstant("REDUCE_SEED", "0"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a + b")}); - break; - case ContractMode::PRODUCT: - jit.AddConstants({MakeJitConstant("REDUCE_SEED", "1"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a * b")}); - break; - case ContractMode::ALL: - jit.AddConstants( - {MakeJitConstant("REDUCE_SEED", "1"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a && b")}); - break; - case ContractMode::ANY: - jit.AddConstants( - {MakeJitConstant("REDUCE_SEED", "0"), MakeJitConstant("REDUCE_OPERATION(a, b)", "a || b")}); - break; - case ContractMode::MAX: - jit.AddConstants({MakeJitConstant("REDUCE_SEED", "UNIT_VAL_MIN"), - MakeJitConstant("REDUCE_OPERATION(a, b)", "UNIT_MAX_FUNC(a,b)")}); - break; - } - - return jit; -} - -ContractKernelBase::DispatchData ContractKernelBase::SetDefault(const contract_params& params) { - const auto& output = params.output; - - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - std::vector global{output.Feature().v, output.Y().v, output.X().v}; - const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); - - kd.gws0 = global[0]; - kd.gws1 = global[1]; - kd.gws2 = global[2]; - - kd.lws0 = local[0]; - kd.lws1 = local[1]; - kd.lws2 = local[2]; - - return kd; -} - -KernelsData ContractKernelBase::GetCommonKernelsData(const Params& params, - const optional_params& options, - float estimated_time) const { - assert(params.GetType() == KernelType::CONTRACT); - - const auto& prim_params = - static_cast(params); - - auto run_info = SetDefault(prim_params); - KernelData k_data = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(prim_params); - auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = k_data.kernels[0]; - FillCLKernelData(kernel, run_info, params.engineInfo, kernelName, jit, entry_point); - k_data.estimatedTime = estimated_time; - - return {k_data}; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h deleted file mode 100644 index e5bb4e81f06..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_base.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" -#include - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// contract_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct contract_params : public base_params { - contract_params() : base_params(KernelType::CONTRACT), mode(ContractMode::ANY) {} - ContractMode mode; - std::vector reduction_axes; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// contract_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct contract_optional_params : optional_params { - contract_optional_params() : optional_params(KernelType::CONTRACT) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ContractKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class ContractKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - - using DispatchData = CommonDispatchData; - -protected: - JitConstants GetJitConstants(const contract_params& params) const; - static DispatchData SetDefault(const contract_params& params); - KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp deleted file mode 100644 index a9ad9a42496..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
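The .cpp half of this header (above) is the interesting part of contract: each bfyx axis is either kept or reduced, and ContractMode picks a seed value plus a binary fold — SUM seeds 0 with a+b, PRODUCT seeds 1 with a*b, MAX seeds the type minimum with max(a,b) — emitted as the REDUCE_SEED / REDUCE_OPERATION JIT pair. A host-side sketch of the same reduce-over-selected-axes scheme on a dense bfyx buffer (hypothetical code, float-only):

    #include <array>
    #include <cstddef>
    #include <limits>
    #include <vector>

    enum class Mode { SUM, PRODUCT, MAX };

    // Reduce a dense bfyx tensor over the axes flagged in 'reduce' ({b,f,y,x}).
    std::vector<float> contract(const std::vector<float>& in,
                                std::array<std::size_t, 4> dims,
                                std::array<bool, 4> reduce, Mode mode) {
        // Seed and fold, as in the REDUCE_SEED / REDUCE_OPERATION JIT pair.
        const float seed = mode == Mode::SUM     ? 0.0f
                         : mode == Mode::PRODUCT ? 1.0f
                                                 : std::numeric_limits<float>::lowest();
        auto op = [mode](float a, float b) {
            return mode == Mode::SUM     ? a + b
                 : mode == Mode::PRODUCT ? a * b
                                         : (a > b ? a : b);
        };
        std::array<std::size_t, 4> od = dims;
        for (int i = 0; i < 4; ++i)
            if (reduce[i]) od[i] = 1;          // reduced axes collapse to size 1
        std::vector<float> out(od[0] * od[1] * od[2] * od[3], seed);
        for (std::size_t b = 0; b < dims[0]; ++b)
        for (std::size_t f = 0; f < dims[1]; ++f)
        for (std::size_t y = 0; y < dims[2]; ++y)
        for (std::size_t x = 0; x < dims[3]; ++x) {
            const std::size_t src = ((b * dims[1] + f) * dims[2] + y) * dims[3] + x;
            const std::size_t dst =
                (((reduce[0] ? 0 : b) * od[1] + (reduce[1] ? 0 : f)) * od[2] +
                 (reduce[2] ? 0 : y)) * od[3] + (reduce[3] ? 0 : x);
            out[dst] = op(out[dst], in[src]);
        }
        return out;
    }

Every input element folds into the output cell whose reduced coordinates are clamped to zero, which is exactly what seeding the output and folding in source order achieves.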
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "contract_kernel_ref.h" - -namespace kernel_selector { -ParamsKey ContractKernelRef::GetSupportedKey() const { - ParamsKey k; - - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableInputDataType(Datatype::INT32); - k.EnableInputDataType(Datatype::INT64); - - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT32); - k.EnableOutputDataType(Datatype::INT64); - - k.EnableInputLayout(DataLayout::bfyx); - - k.EnableOutputLayout(DataLayout::bfyx); - - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - - return k; -} - -KernelsData ContractKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options, FORCE_PRIORITY_9); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h deleted file mode 100644 index feabcafab76..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_ref.h +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "contract_kernel_base.h" - -namespace kernel_selector { -class ContractKernelRef : public ContractKernelBase { -public: - ContractKernelRef() : ContractKernelBase("contract_ref") {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp deleted file mode 100644 index e339c1fefa6..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "contract_kernel_selector.h" -#include "contract_kernel_ref.h" - -namespace kernel_selector { -contract_kernel_selector::contract_kernel_selector() { Attach(); } - -KernelsData contract_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::CONTRACT); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h deleted file mode 100644 index b286988d504..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/contract/contract_kernel_selector.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class contract_kernel_selector : public kernel_selector_base { -public: - static contract_kernel_selector& Instance() { - static contract_kernel_selector instance; - return instance; - } - - contract_kernel_selector(); - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp deleted file mode 100644 index ce52aedc295..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byx8_f4); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::Validate(const Params& p, const optional_params& o) const { - if (!Parent::Validate(p, o)) { - return false; - } - - return true; -} - -size_t static get_wg_batch_size(const convolution_params& params) { - if (params.inputs[0].Batch().v % 64 == 0) - return 32; - return 1; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = (arg.output.Batch().v * arg.output.Feature().v) / (4 * 2); - runInfo.gws1 = arg.output.X().v / 8; - runInfo.gws2 = arg.output.Y().v / 2; - - runInfo.lws0 = 8 * get_wg_batch_size(arg); - runInfo.lws1 = 1; - runInfo.lws2 = 1; - - return runInfo; -} - -JitConstants ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetJitConstants(const convolution_params& params, - const DispatchData& kd) const { - auto jits = ConvolutionKernelBase::GetJitConstants(params, kd); - - jits.AddConstant(MakeJitConstant("WG_BATCH_SIZE", get_wg_batch_size(params))); - - return jits; -} - -KernelsData ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options, " -Dcl_intel_subgroups_char"); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_3; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h deleted file mode 100644 index b7eeb1e51fa..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
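Worth noting in the deleted SetDefault above: get_wg_batch_size switches the work-group geometry on the batch size, packing 32 batches per work-group when the batch is a multiple of 64 (lws0 = 8 * 32 = 256) and falling back to lws0 = 8 otherwise. The dispatch arithmetic, isolated as a sketch (hypothetical struct):

    #include <cstddef>

    struct Dispatch { std::size_t gws[3], lws[3]; };

    // Mirrors ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32::SetDefault above.
    Dispatch set_default(std::size_t batch, std::size_t feature,
                         std::size_t x, std::size_t y) {
        const std::size_t wg_batch = (batch % 64 == 0) ? 32 : 1;
        return Dispatch{
            { (batch * feature) / (4 * 2), x / 8, y / 2 },  // global sizes
            { 8 * wg_batch, 1, 1 }                          // local sizes
        };
    }

The integer divisions encode the kernel's implicit preconditions (batch*feature divisible by 8, x by 8, y by 2); inputs violating them would under-cover the output.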
- - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32() - : ConvolutionKernelBase("convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32") {} - virtual ~ConvolutionKernel_byx8_f4__fs_bs_yx_bsv4_fsv32() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - ConvolutionKernelBase::DispatchData SetDefault(const convolution_params& arg, int) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_y_x8_osv8_isv4_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp deleted file mode 100644 index 45bb5a16728..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = (arg.output.Batch().v * arg.output.Feature().v) / 4; - runInfo.gws1 = arg.output.X().v / 8; - runInfo.gws2 = arg.output.Y().v; - - runInfo.lws0 = 8; - runInfo.lws1 = 1; - runInfo.lws2 = 1; - - return runInfo; -} - -KernelsData ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h deleted file mode 100644 index 3507b157074..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
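The ParamsKey blocks in these files act as capability masks: a kernel enables exactly the datatypes, layouts, and features it can execute, and the selector discards kernels whose mask does not cover the request. A self-contained sketch of the idea; the bit names are invented for illustration and do not match clDNN's encoding.

    #include <cstdint>

    enum : uint32_t {
        SUPPORT_INT8_INPUT  = 1u << 0,
        SUPPORT_INT8_OUTPUT = 1u << 1,
        SUPPORT_BYXF_INPUT  = 1u << 2,
        SUPPORT_DILATION    = 1u << 3,
    };

    // A kernel is eligible only if every requested capability bit is enabled.
    constexpr bool Supports(uint32_t kernel_key, uint32_t request) {
        return (kernel_key & request) == request;
    }

    static_assert(Supports(SUPPORT_INT8_INPUT | SUPPORT_INT8_OUTPUT | SUPPORT_BYXF_INPUT,
                           SUPPORT_INT8_INPUT | SUPPORT_BYXF_INPUT),
                  "an int8 byxf request fits this kernel");
    static_assert(!Supports(SUPPORT_INT8_INPUT | SUPPORT_BYXF_INPUT, SUPPORT_DILATION),
                  "a dilated request is rejected, much like a kernel without EnableDilation()");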
- - -#pragma once - -#include "convolution_kernel_base.h" -#include <vector> - -namespace kernel_selector { - -class ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32 : public ConvolutionKernelBase { -public: - ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32() : ConvolutionKernelBase("convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32") {} - virtual ~ConvolutionKernel_byxf_fs_bs_yx_bsv4_fsv32() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - ConvolutionKernelBase::DispatchData SetDefault(const convolution_params& arg, int) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::yxio; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp deleted file mode 100644 index 35fade3594b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -// Copyright (c) 2016-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_kernel_mmad_1x1_gemm.h" - -namespace kernel_selector { - -ParamsKey ConvolutionKernel_mmad_1x1_gemm::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableDepthwiseSeparableOpt(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_1x1_gemm::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o)) { - return false; - } - - const auto& params = static_cast<const convolution_params&>(p); - - if (params.filterSize.x != 1 || params.filterSize.y != 1) - return false; - - if (params.stride.x != 1 || params.stride.y != 1) - return false; - - if (params.padding.x != 0 || params.padding.y != 0) - return false; - - const auto& input = params.inputs[0]; - - // we do not support padded input - if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0) - return false; - - if (params.split != 1) - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_1x1_gemm::SetDefault(const convolution_params& arg, int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - // Sub-group size used by "convolution_1x1_gemm_MMAD" kernel.
- constexpr size_t sub_group_size = 8; - - const auto of_maps = arg.output.Feature().v; - const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size); - - runInfo.efficiency = FORCE_PRIORITY_2; - - runInfo.gws0 = RoundUp(arg.output.X().v * arg.output.Y().v, 8) / 8; - runInfo.gws1 = of_threads_per_batch * arg.output.Batch().v; - runInfo.gws2 = 1; - - runInfo.lws0 = 1; - runInfo.lws1 = sub_group_size; - runInfo.lws2 = 1; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_1x1_gemm::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1)); - - // pitch for special block format used in this kernel - const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32); - const size_t filter_ofm_block_pitch = (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8; - jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_1x1_gemm::GetKernelsData(const Params& params, const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h deleted file mode 100644 index 001e92a16c8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_1x1_gemm.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2016 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
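The FILTER_OFM_BLOCK_PITCH computation above packs weights into 4*8*8-element blocks (the os_is_yx_isa8_osv8_isv4 layout) with the input-feature count aligned up to 32. Reproduced as standalone arithmetic with a hypothetical IFM of 48 and a 1x1 filter: Align(48, 32) = 64, so the pitch is (64/32) * 1 * 1 * 256 = 512 elements.

    #include <cstddef>

    // Stand-in for kernel_selector's Align(): round v up to a multiple of a.
    constexpr size_t AlignTo(size_t v, size_t a) { return ((v + a - 1) / a) * a; }

    constexpr size_t filter_ofm_block_pitch(size_t ifm, size_t filter_x, size_t filter_y) {
        return (AlignTo(ifm, 32) / 32) * filter_x * filter_y * 4 * 8 * 8;  // 4*8*8 = one weight block
    }

    static_assert(AlignTo(48, 32) == 64, "48 input features round up to 64");
    static_assert(filter_ofm_block_pitch(48, 1, 1) == 512, "matches the worked example above");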
- - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_1x1_gemm : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_1x1_gemm() : ConvolutionKernelBase("convolution_gpu_1x1_gemm_MMAD") {} - virtual ~ConvolutionKernel_mmad_1x1_gemm() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::os_is_yx_isa8_osv8_isv4; - } -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp deleted file mode 100644 index 69d79e7527e..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast<const convolution_params&>(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 128 != 0) // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M=128 - return false; - - if (k % 32 != 0) // Matrix size K, Must be multiple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 128; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 128)); // Work-Group tile size M, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", 1)); // Persistent threads - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", 1)); // Persistent threads - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h deleted file mode 100644 index 4ae916967fc..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file
except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8() - : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_128x128wg_slm_int8") {} - - virtual ~ConvolutionKernel_mmad_32x32sg_128x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp deleted file mode 100644 index 6360cd68b1d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp +++ /dev/null @@ -1,180 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
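The 32x32sg/128x128wg kernel above, along with the 224x128 and 32x32 variants that follow, treats the 1x1 convolution as a GEMM with M = X*Y*B, K = input features, and N = output features. Its thread-count arithmetic, restated with hypothetical sizes M=256, N=128:

    #include <cstddef>

    constexpr size_t SG_TILE_M = 32, SG_TILE_N = 32, SG_SIZE = 8;
    constexpr size_t WG_TILE_M = 128, WG_TILE_N = 128;
    constexpr size_t TILES_PER_SG_X = 1, TILES_PER_SG_Y = 1;

    // threadsX/threadsY exactly as computed in SetDefault() above.
    constexpr size_t threads_x(size_t n) { return (n / (SG_TILE_N / SG_SIZE)) / TILES_PER_SG_X; }
    constexpr size_t threads_y(size_t m) { return (m / SG_TILE_M) / TILES_PER_SG_Y; }

    static_assert(threads_x(128) == 32, "N=128 gives 32 work-items along X");
    static_assert(threads_y(256) == 8, "M=256 gives 8 work-items along Y");
    // lws = {SG_SIZE * WG_TILE_N / SG_TILE_N, WG_TILE_M / SG_TILE_M, 1} = {32, 4, 1}
    static_assert(SG_SIZE * WG_TILE_N / SG_TILE_N == 32 && WG_TILE_M / SG_TILE_M == 4,
                  "one 32x4 work-group per 128x128 output tile");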
-*/ - -#include "convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast<const convolution_params&>(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 224 != 0) // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M=224 - return false; - - if (k % 32 != 0) // Matrix size K, Must be multiple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::SetDefault( - const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 224; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 224)); // Work-Group tile size M, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h deleted file mode 100644 index 4ac16cf5e1b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h +++ /dev/null @@
-1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8() - : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_224x128wg_slm_int8") {} - - virtual ~ConvolutionKernel_mmad_32x32sg_224x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp deleted file mode 100644 index 141ec7cf439..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
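All three deleted MMAD SLM variants validate the same divisibility preconditions on the GEMM sizes. One subtlety: because the work-group tiles are themselves multiples of 32, a conjunction such as m % 32 != 0 && m % 224 != 0 can only ever fail on its first clause, so the shipped check effectively requires only a multiple of 32. The sketch below implements the stricter rule the comments describe (multiple of 32 and of the WG tile); this is an interpretation for illustration, not the shipped behaviour.

    #include <cstddef>

    constexpr bool gemm_sizes_ok(size_t m, size_t k, size_t n,
                                 size_t wg_tile_m, size_t wg_tile_n) {
        return m % 32 == 0 && m % wg_tile_m == 0 &&  // M: multiple of 32 and of the WG tile
               k % 32 == 0 &&                        // K: multiple of 32 (MATRIX_SMALL_K)
               n % 32 == 0 && n % wg_tile_n == 0;    // N: multiple of 32 and of the WG tile
    }

    static_assert(gemm_sizes_ok(256, 64, 128, 128, 128), "fits the 128x128 work-group variant");
    static_assert(!gemm_sizes_ok(256, 64, 128, 224, 128), "M=256 is not a multiple of 224");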
-*/ - -#include "convolution_kernel_mmad_32x32sg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey ConvolutionKernel_mmad_32x32sg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableQuantization(QuantizationType::SYMMETRIC); - k.DisableTuning(); - return k; -} - -bool ConvolutionKernel_mmad_32x32sg_slm_int8::Validate(const Params& p, const optional_params& o) const { - if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast<const convolution_params&>(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0) // Matrix size M, Must be multiple of 32 - return false; - - if (k % 32 != 0) // Matrix size K, Must be multiple of 32 - return false; - - if (n % 32 != 0) // Matrix size N, Must be multiple of 32 - return false; - - return true; -} - -ConvolutionKernelBase::DispatchData ConvolutionKernel_mmad_32x32sg_slm_int8::SetDefault(const convolution_params& arg, - int) const { - DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_2; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 32; - size_t _WG_TILE_N = 32; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - -JitConstants ConvolutionKernel_mmad_32x32sg_slm_int8::GetJitConstants(const convolution_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 32)); // Work-Group tile size M, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 32)); // Work-Group tile size N, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, Must be multiple of 32 and multiple of WG_TILE_M - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, Must be multiple of 32 - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, Must be multiple of 32 and multiple of WG_TILE_N - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - return jit; -} - -KernelsData ConvolutionKernel_mmad_32x32sg_slm_int8::GetKernelsData(const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_2; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h deleted file mode 100644 index 6a9250d5e8a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_mmad_32x32sg_slm_int8.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "convolution_kernel_base.h" -#include - -namespace kernel_selector { - -class ConvolutionKernel_mmad_32x32sg_slm_int8 : public ConvolutionKernelBase { -public: - using Parent = ConvolutionKernelBase; - ConvolutionKernel_mmad_32x32sg_slm_int8() : ConvolutionKernelBase("convolution_gpu_mmad_32x32sg_slm_int8") {} - - virtual ~ConvolutionKernel_mmad_32x32sg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override { - return WeightsLayout::is_o_yx_isv32; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp index a1f16f37da9..558b2265399 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp @@ -35,19 +35,8 @@ #include "convolution_kernel_winograd_6x3_s1_fused.h" #include "convolution_kernel_mmad.h" #include "convolution_kernel_mmad_blocks.h" -#include "convolution_kernel_mmad_1x1_gemm.h" #include "convolution_kernel_imad_byxf_af32_depthwise.h" -#include "convolution_kernel_mmad_batched.h" #include "convolution_kernel_bfyx_depthwise_weights_lwg.h" -#include "convolution_kernel_mmad_slm_2x14_rep4.h" -#include "convolution_kernel_mmad_slm_7x7_rep4.h" -#include "convolution_kernel_byxf_fs_bs_yx_bsv4_fsv32.h" -#include "convolution_kernel_mmad_batched_block.h" -#include "convolution_kernel_mmad_batched_block_1x1.h" -#include "convolution_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "convolution_kernel_mmad_32x32sg_224x128wg_slm_int8.h" -#include "convolution_kernel_mmad_32x32sg_slm_int8.h" -#include "convolution_kernel_byx8_f4__fs_bs_yx_bsv4_fsv32.h" #include "convolution_kernel_imad.h" #include "convolution_kernel_fs_byx_fsv32.h" #include "convolution_kernel_fs_byx_fsv32_1x1.h" @@ -134,19 +123,6 @@ convolution_kernel_selector::convolution_kernel_selector() { Attach(); Attach(); Attach(); - Attach(); - - // fs_bs_yx_bsv4_fsv32 int8 - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - Attach(); - // Attach(); // b_fs_yx_fsv4 kernels Attach(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp deleted file mode 100644 index 85a7d07a936..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.cpp +++ /dev/null @@ -1,67 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_1x1.h" - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel1x1::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableSubGroup(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel1x1::Validate(const Params& p, const optional_params&) const { - const convolution_grad_weights_params& params = static_cast<const convolution_grad_weights_params&>(p); - - if (params.filterSize.x != 1 || params.filterSize.y != 1) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel1x1::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.gws0 = 16; - kd.gws1 = input_features; - kd.gws2 = output_features; - kd.lws0 = 16; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_8; - return kd; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h deleted file mode 100644 index 7770075a385..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_1x1.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel1x1 : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel1x1() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_1x1") {} - virtual ~ConvolutionGradWeightsKernel1x1() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp deleted file mode 100644 index 6158d7a7ead..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
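The deleted 1x1 grad-weights kernel above uses a fixed dispatch: one 16-wide work-group per (input feature, output feature) pair. With hypothetical IFM=64 and OFM=128 that is 16 * 64 * 128 = 131072 work-items in 8192 groups. As plain data:

    #include <array>
    #include <cstddef>

    struct Dispatch { std::array<size_t, 3> gws, lws; };

    // Mirrors ConvolutionGradWeightsKernel1x1::SetDefault() above.
    constexpr Dispatch grad_weights_1x1_dispatch(size_t ifm, size_t ofm) {
        return {{16, ifm, ofm}, {16, 1, 1}};
    }

    static_assert(grad_weights_1x1_dispatch(64, 128).gws[1] == 64, "one row of work-groups per input feature");
    static_assert(grad_weights_1x1_dispatch(64, 128).lws[0] == 16, "16 work-items per group");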
-*/ - -#include "convolution_grad_weights_kernel_3x3.h" -#include <algorithm> - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel3x3::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel3x3::Validate(const Params& p, const optional_params&) const { - const auto& params = static_cast<const convolution_grad_weights_params&>(p); - - if (params.stride.x != 1 || params.stride.y != 1) - return false; - if (params.filterSize.x != 3 || params.filterSize.y != 3) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel3x3::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.gws0 = Align(output_features, 16); - kd.gws1 = input_features; - kd.gws2 = 1; - kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32)); - while (kd.gws0 % kd.lws0 != 0) { - kd.lws0 -= 16; - } - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_8; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h deleted file mode 100644 index 48f3591a1bb..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_3x3.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
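The local-size trimming loop in the 3x3 kernel above can only land on 32 or 16: gws0 is aligned to 16, the loop starts from min(gws0, 32), and it steps down by 16 until the value divides gws0. A standalone equivalent with two checked cases:

    #include <algorithm>
    #include <cstddef>

    constexpr size_t pick_lws0(size_t gws0) {  // gws0 is Align(OFM, 16), so a multiple of 16
        size_t lws0 = std::min(std::max(gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
        while (gws0 % lws0 != 0)
            lws0 -= 16;
        return lws0;
    }

    static_assert(pick_lws0(64) == 32, "64 is divisible by 32");
    static_assert(pick_lws0(48) == 16, "48 % 32 != 0, so fall back to 16");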
-*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel3x3 : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel3x3() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_3x3") {} - virtual ~ConvolutionGradWeightsKernel3x3() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp deleted file mode 100644 index 2e4254d3cd4..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_7x7.h" -#include <algorithm> - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel7x7::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel7x7::Validate(const Params& p, const optional_params&) const { - const auto& params = static_cast<const convolution_grad_weights_params&>(p); - - if (params.filterSize.x != 7 || params.filterSize.y != 7) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel7x7::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.gws0 = 8; - kd.gws1 = Align(output_features, 16); - kd.gws2 = input_features; - kd.lws0 = 1; - kd.lws1 = std::min(std::max(kd.gws1, static_cast<size_t>(1)), static_cast<size_t>(32)); - while (kd.gws1 % kd.lws1 != 0) { - kd.lws1 -= 16; - } - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_8; - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h deleted file mode 100644 index a1f99ce0799..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_7x7.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel7x7 : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel7x7() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_7x7") {} - virtual ~ConvolutionGradWeightsKernel7x7() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp deleted file mode 100644 index 6d799f73849..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
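Taken together, the deleted 1x1, 3x3, and 7x7 files follow one pattern: each specialization gates itself in Validate() on an exact filter size (and, for 3x3, unit stride) and reports FORCE_PRIORITY_8, while the base implementation that follows dispatches generically at DONT_USE_IF_HAVE_SOMETHING_ELSE priority, so it only runs when no specialization matched. Reduced to its selection logic; the priority values are invented for the sketch, since in clDNN they are opaque efficiency constants:

    #include <cstddef>

    struct FilterSize { size_t x, y; };

    constexpr float kSpecializedPriority = 8.0f;  // stands in for FORCE_PRIORITY_8
    constexpr float kFallbackPriority = 0.0f;     // stands in for DONT_USE_IF_HAVE_SOMETHING_ELSE

    constexpr float priority_for(FilterSize f) {
        const bool specialized = (f.x == 1 && f.y == 1) ||
                                 (f.x == 3 && f.y == 3) ||
                                 (f.x == 7 && f.y == 7);
        return specialized ? kSpecializedPriority : kFallbackPriority;
    }

    static_assert(priority_for({3, 3}) == kSpecializedPriority, "3x3 hits a tuned kernel");
    static_assert(priority_for({5, 5}) == kFallbackPriority, "5x5 falls through to the generic path");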
- - -#include "convolution_grad_weights_kernel_base.h" -#include "kernel_selector_utils.h" -#include -#include -#include - -namespace kernel_selector { -std::string convolution_grad_weights_params::to_string() const { - std::stringstream s; - - s << base_params::to_string() << "_"; - if (bias.empty()) { - s << "no_bias" - << "_"; - } else { - s << "bias_" << bias[0].PhysicalSize() << "_"; - } - s << filterSize.x << "_" << filterSize.y << "_"; - s << stride.x << "_" << stride.y << "_"; - s << dilation.x << "_" << dilation.y << "_"; - s << padding.x << "_" << padding.y << "_"; - s << split; - - return s.str(); -} - -JitConstants ConvolutionGradWeightsKernelBase::GetJitConstants(const convolution_grad_weights_params& cp) const { - JitConstants jit = training_kernel_base::GetJitConstants(cp); - const auto& padding = cp.padding; - const auto& input = cp.inputs[0]; - - int64_t input_offset_with_padding = (int64_t)input.GetFirstElementOffset() - - (cp.filterSize.x - 1 + padding.x) * input.X().pitch - - (cp.filterSize.y - 1 + padding.y) * input.Y().pitch; - input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0); - - jit.AddConstants({ - MakeJitConstant("STRIDE", cp.stride), - MakeJitConstant("PADDING", cp.padding), - MakeJitConstant("DILATION", cp.dilation), - MakeJitConstant("FILTER_ARRAY_NUM", cp.split), - MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), - MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", cp.depthwise_separable_opt), - MakeJitConstant("OUTPUT_GRAD_W", cp.output_grad_w), - }); - - return jit; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernelBase::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - size_t gws0 = output_features * input_features; - size_t lws0 = std::min(gws0, static_cast<size_t>(32)); - while (gws0 % lws0) { - lws0--; - } - kd.gws0 = gws0; - kd.gws1 = params.weights.X().v; - kd.gws2 = params.weights.Y().v; - kd.lws0 = lws0; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE; - return kd; -} - -KernelsData ConvolutionGradWeightsKernelBase::GetKernelsData(const Params& params, - const optional_params& options) const { - assert(params.GetType() == KernelType::CONVOLUTION_GRAD_WEIGHTS); - - if (!Validate(params, options)) { - return {}; - } - - const convolution_grad_weights_params& orgParams = static_cast<const convolution_grad_weights_params&>(params); - - DispatchData runInfo = SetDefault(orgParams); - KernelData kd = KernelData::Default<convolution_grad_weights_params>(params); - convolution_grad_weights_params& newParams = *static_cast<convolution_grad_weights_params*>(kd.params.get()); - - bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oiyx, kd.weightsReorderParams); - - if (!succeed) { - return {}; - } - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, - runInfo, - params.engineInfo, - kernelName, - jit, - entry_point, - DEFAULT, - true, - !orgParams.bias.empty()); - if (newParams.use_momentum) { - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0}); - if (!newParams.bias.empty()) - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_BIAS_GRADIENT, 0}); - } - kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); - kernel.arguments.push_back({ArgumentDescriptor::Types::SPLIT, 0}); - kernel.arguments.push_back({ArgumentDescriptor::Types::LEARNING_RATE, 0}); - - kd.estimatedTime = runInfo.efficiency; - - return {kd}; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h deleted file mode 100644 index d3f843b174d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_base.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#pragma once - -#include "training_kernel_base.h" -#include "kernel_selector_params.h" -#include - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// convolution_grad_weights_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct convolution_grad_weights_params : public training_params { - convolution_grad_weights_params() : training_params(KernelType::CONVOLUTION_GRAD_WEIGHTS) {} - - uSize filterSize; - uSize stride; - uSize dilation; - uSize padding; - uint32_t split = 1; - bool depthwise_separable_opt = false; - bool output_grad_w = false; - - std::string to_string() const override; - - ParamsKey GetParamsKey() const override { - ParamsKey k = training_params::GetParamsKey(); - - if (split > 1) { - k.EnableSplitSupport(); - } - - if (dilation.x != 1 || dilation.y != 1) { - k.EnableDilation(); - } - - if (depthwise_separable_opt) { - k.EnableDepthwiseSeparableOpt(); - } - return k; - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// convolution_grad_weights_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct convolution_grad_weights_optional_params : training_optional_params { - convolution_grad_weights_optional_params() : training_optional_params(KernelType::CONVOLUTION_GRAD_WEIGHTS) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ConvolutionGradWeightsKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class ConvolutionGradWeightsKernelBase : public training_kernel_base { -public: - using training_kernel_base::training_kernel_base; - virtual ~ConvolutionGradWeightsKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - virtual KernelsData GetKernelsData(const Params&
params, const optional_params& options) const; - virtual JitConstants GetJitConstants(const convolution_grad_weights_params& params) const; - virtual DispatchData SetDefault(const convolution_grad_weights_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp deleted file mode 100644 index 6ce107dcec7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_ref.h" - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h deleted file mode 100644 index 141ca55ec6f..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_ref.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
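The GetSupportedKey implementations above describe each kernel's capabilities declaratively; conceptually, ParamsKey behaves like a capability bitmask that the selector intersects with the request. A simplified model for intuition only; SimpleKey and its members are illustrative names, not the real clDNN types:

    #include <cstdint>

    // Toy model of ParamsKey: each Enable* call sets a capability bit; a kernel
    // can serve a request iff every bit the request needs is also set by the kernel.
    struct SimpleKey {                 // illustrative, not the clDNN class
        uint64_t bits = 0;
        void Enable(int bit) { bits |= 1ull << bit; }
        bool Supports(const SimpleKey& request) const {
            return (request.bits & ~bits) == 0;  // request must be a subset
        }
    };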
- - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernelRef : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernelRef() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_ref") {} - virtual ~ConvolutionGradWeightsKernelRef() {} - - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp deleted file mode 100644 index 405c6801516..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "convolution_grad_weights_kernel_selector.h" -#include "convolution_grad_weights_kernel_ref.h" -#include "convolution_grad_weights_kernel_1x1.h" -#include "convolution_grad_weights_kernel_yxfb.h" -#include "convolution_grad_weights_kernel_3x3.h" -#include "convolution_grad_weights_kernel_7x7.h" - -namespace kernel_selector { -convolution_grad_weights_kernel_selector::convolution_grad_weights_kernel_selector() { - Attach<ConvolutionGradWeightsKernelRef>(); - Attach<ConvolutionGradWeightsKernel1x1>(); - Attach<ConvolutionGradWeightsKernel_yxfb>(); - Attach<ConvolutionGradWeightsKernel3x3>(); - Attach<ConvolutionGradWeightsKernel7x7>(); -} - -KernelsData convolution_grad_weights_kernel_selector::GetBestKernels(const Params& params, - const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::CONVOLUTION_GRAD_WEIGHTS); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h deleted file mode 100644 index ed5a30c6df5..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_selector.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
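The selector deleted above follows the same pattern as every selector in this patch: a singleton attaches candidate kernels, and GetNaiveBestKernel keeps the valid candidate with the best efficiency. A rough sketch of that ranking under assumed interfaces; KernelCandidate, Validate, Efficiency, and PickNaiveBest are illustrative stand-ins, not the kernel_selector API:

    #include <memory>
    #include <vector>

    // Illustrative stand-in for a kernel implementation.
    struct KernelCandidate {
        virtual ~KernelCandidate() = default;
        virtual bool Validate() const = 0;      // e.g. filter-size / layout checks
        virtual float Efficiency() const = 0;   // e.g. FORCE_PRIORITY_7 and friends
    };

    // Naive "best kernel": the valid candidate with the highest efficiency wins.
    const KernelCandidate* PickNaiveBest(
            const std::vector<std::unique_ptr<KernelCandidate>>& attached) {
        const KernelCandidate* best = nullptr;
        for (const auto& k : attached)
            if (k->Validate() && (!best || k->Efficiency() > best->Efficiency()))
                best = k.get();
        return best;
    }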
- - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class convolution_grad_weights_kernel_selector : public kernel_selector_base { -public: - static convolution_grad_weights_kernel_selector& Instance() { - static convolution_grad_weights_kernel_selector instance_; - return instance_; - } - - convolution_grad_weights_kernel_selector(); - - virtual ~convolution_grad_weights_kernel_selector() {} - - KernelsData GetBestKernels(const Params& params, const optional_params& options) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp deleted file mode 100644 index d5b63fe62d7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "convolution_grad_weights_kernel_yxfb.h" - -namespace kernel_selector { - -ParamsKey ConvolutionGradWeightsKernel_yxfb::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableSubGroup(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableMomentum(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableGradient(); - k.DisableTuning(); - return k; -} - -bool ConvolutionGradWeightsKernel_yxfb::Validate(const Params& p, const optional_params&) const { - const convolution_grad_weights_params& params = static_cast<const convolution_grad_weights_params&>(p); - auto batch = params.inputs[0].Batch().v; - - if (batch % 16 != 0) - return false; - if (params.stride.x != 1 || params.stride.y != 1) - return false; - return true; -} - -ConvolutionGradWeightsKernelBase::DispatchData ConvolutionGradWeightsKernel_yxfb::SetDefault( - const convolution_grad_weights_params& params) const { - auto input_features = params.weights.IFM().v; - auto output_features = params.weights.OFM().v; - auto x = params.weights.X().v; - auto y = params.weights.Y().v; - - DispatchData kd; - - kd.gws0 = 16; - kd.gws1 = input_features * output_features; - kd.gws2 = x * y; - - kd.lws0 = 16; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = FORCE_PRIORITY_7; - - return kd; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h deleted file mode 100644 index 6e897babfa2..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution_grad_weights/convolution_grad_weights_kernel_yxfb.h +++ /dev/null @@ -1,32 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "convolution_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ConvolutionGradWeightsKernel_yxfb : public ConvolutionGradWeightsKernelBase { -public: - ConvolutionGradWeightsKernel_yxfb() : ConvolutionGradWeightsKernelBase("convolution_grad_weights_yxfb") {} - virtual ~ConvolutionGradWeightsKernel_yxfb() {} - - DispatchData SetDefault(const convolution_grad_weights_params& params) const override; - bool Validate(const Params& p, const optional_params& o) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp index 8ec74cd406f..4084bdb8ae1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_bfyx_opt.cpp @@ -35,7 +35,6 @@ ParamsKey DeconvolutionKernel_bfyx_opt::GetSupportedKey() const { k.EnableBatching(); k.EnableSplitSupport(); k.EnableDepthwiseSeparableOpt(); - k.EnableGradient(); k.EnableGroupedConvolution(); return k; } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp index 4f6bfc29ade..b3d4268b4d9 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/deconvolution/deconvolution_kernel_ref.cpp @@ -55,7 +55,6 @@ ParamsKey DeconvolutionKernelRef::GetSupportedKey() const { k.EnableBatching(); k.EnableSplitSupport(); k.EnableDepthwiseSeparableOpt(); - k.EnableGradient(); k.EnableGroupedConvolution(); k.EnableDifferentTypes(); k.EnableDifferentInputWeightsTypes(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp index 102fcf2a59f..38c69c3c017 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp +++ 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp @@ -222,9 +222,7 @@ KernelsData EltwiseKernel_b_fs_yx_fsv16::GetKernelsData(const Params& params, co kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, - false, - newParams.int8_quantization, - newParams.output_calibration); + false); kd.estimatedTime = runInfo.efficiency; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp deleted file mode 100644 index 1f173bb8886..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.cpp +++ /dev/null @@ -1,288 +0,0 @@ -/* -// Copyright (c) 2019-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "eltwise_kernel_b_fs_yx_fsv4.h" -#include "kernel_selector_utils.h" -#include -#include - -namespace kernel_selector { - -ParamsKey EltwiseKernel_b_fs_yx_fsv4::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnableInt8Quantization(); - k.EnableEltwiseStride(); - return k; -} - -EltwiseKernelBase::DispatchData EltwiseKernel_b_fs_yx_fsv4::SetDefault(const eltwise_params& params) const { - DispatchData kd; - - // Because of very specific requirements for data, we may linearize the data, - // i.e. use only one dimension, e.g. 'X'. - - // GWS: - // we process 4*4 (4 int8 bytes per block_read4 read) features per workitem - kd.gws0 = params.output.X().v * params.output.Y().v * params.output.Batch().v * params.output.Feature().v / (4 * 4); - kd.gws1 = 1; - kd.gws2 = 1; - // LWS: - kd.lws0 = 8; - kd.lws1 = 1; - kd.lws2 = 1; - - kd.efficiency = FORCE_PRIORITY_1; - return kd; -} - -bool EltwiseKernel_b_fs_yx_fsv4::Validate(const Params& params, const optional_params& options) const { - // Requirements to use the 'eltwise_b_fs_yx_fsv4' kernel are below: - // 1. No stride - // 2. All dimensions for all inputs are the same - // 3. No padding - // So, it can be linearized - - if (!Parent::Validate(params, options)) { - return false; - } - - KernelData kd = KernelData::Default<eltwise_params>(params); - eltwise_params& newParams = *static_cast<eltwise_params*>(kd.params.get()); - - // 1. No stride - if (!newParams.stride.empty()) { - return false; - } - - for (size_t i = 0; i < newParams.inputs.size() - 1; i++) { - // 2. All dimensions for all inputs are the same - if (!(newParams.inputs[i] == newParams.inputs[i + 1])) { - return false; - } - } - - const auto& in = newParams.inputs[0]; - for (size_t i = 0; i < in.Dimentions(); i++) { - // 3. No padding - if ((in.GetDims()[i].pad.before != 0) || (in.GetDims()[i].pad.after != 0)) { - return false; - } - } - - return true; -} - -JitConstants EltwiseKernel_b_fs_yx_fsv4::GetJitConstants(const eltwise_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - if (params.inputs[0].GetDType() == Datatype::UINT8) { - // Special handler for unsigned types - jit.AddConstants({MakeJitConstant("ELTW_UNSIGNED", 1)}); - } - - /////////////// - jit.AddConstants({ - MakeJitConstant("ELTWISE_LAYOUT_BASED", params.layoutBased), - MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization), - }); - - if (params.int8_quantization) { - if (params.output_calibration) { - jit.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.output_calibration)); - jit.AddConstant(MakeJitConstant("O_QF", params.output_calibration_factors[0])); - - } else { - jit.AddConstants({MakeJitConstant("O_QF", params.output_quantization_factor)}); - } - } - - std::string inputs_decls; - auto& updateInputs = params.updateInputIds; - - for (size_t i = 0; i < params.inputs.size(); i++) { - // const should be added only to inputs which will not be updated - std::string const_str = "const"; - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) { - if (updateInputs[update_input_idx].inputId == i) { - const_str = ""; - break; - } - } - - inputs_decls += - const_str + " __global " + toCLType(params.inputs[i].GetDType()) + "* input" + std::to_string(i) + ", "; - } - - jit.AddConstant(MakeJitConstant("INPUTS_DECLS", inputs_decls)); - jit.AddConstant(MakeJitConstant("ELTWISE_NO_PITCH_SAME_DIMS", CheckInputsOutputNoPitchSameDims(params))); - - std::string do_eltwise; - - auto& operations = params.operations; - auto& coefficients = params.coefficients; - - for (size_t op_num = 0; op_num < operations.size(); op_num++) { - const std::string op_num_str = std::to_string(op_num); - const auto& ew = operations[op_num]; - - for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - const std::string name = "INPUT_" + op_num_str + "_" + std::to_string(input_idx); - switch (input.mode) { - case EltwiseInputMode::SCALAR: - jit.AddConstant(MakeJitConstant(name, input.scalar)); - break; - case EltwiseInputMode::INPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, - "GET_INPUT(input" + std::to_string(input.index) + ", INPUT" + - std::to_string(input.index) + ")")); - break; - case EltwiseInputMode::OUTPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT, )]")); - break; - case EltwiseInputMode::UNORDERED_ACCESS_INPUT_BUFFER: - jit.AddConstant(MakeJitConstant( - name, - "input" + std::to_string(input.index) + "[(size_t)tmp" + std::to_string(input.tmpIndex) + "]")); - break; - case EltwiseInputMode::INTERMEDIATE_RESULTS_INDEX: - jit.AddConstant(MakeJitConstant(name, "tmp" + std::to_string(input.tmpIndex))); - break; - default: - break; - } - } - std::string input0_str, input1_str, cast_type, op; - - cast_type = "(int16)"; - op = "const int16 tmp" + op_num_str + " = "; - - input0_str = cast_type + "INPUT_" + op_num_str + "_0"; - input1_str = cast_type + "INPUT_" + op_num_str + "_1"; - - if (ew.mode == EltwiseMode::ADD) { - std::vector<std::string> coeff_strings(ew.inputs.size(), ""); - for
(size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - if (input.mode == EltwiseInputMode::INPUT_BUFFER && input.index < coefficients.size()) { - const float c = coefficients[input.index]; - if (c != 1.0f) - coeff_strings[input_idx] = cast_type + "(" + std::to_string(c) + ")*"; - } - } - - input0_str = coeff_strings[0] + input0_str; - input1_str = coeff_strings[1] + input1_str; - } - - switch (ew.mode) { - case EltwiseMode::ADD: - op += input0_str + " + " + input1_str; - break; - case EltwiseMode::SUB: - op += input0_str + " - " + input1_str; - break; - case EltwiseMode::MUL: - op += input0_str + " * " + input1_str; - break; - case EltwiseMode::DIV: - op += input0_str + " / " + input1_str; - break; - case EltwiseMode::MODULU: - case EltwiseMode::MIN: - case EltwiseMode::MAX: { - auto mode = (ew.mode == EltwiseMode::MODULU ? "mod" : (ew.mode == EltwiseMode::MIN ? "min" : "max")); - auto input_0_type = params.inputs[0].GetDType(); - auto input_1_type = params.inputs[1].GetDType(); - - // input_0 == int - if (input_0_type == kernel_selector::Datatype::INT8 || - input_0_type == kernel_selector::Datatype::UINT8) { - // input_0 == int && input_1 == int - if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::UINT8) { - if (ew.mode == EltwiseMode::MODULU) - op += input0_str + " % " + input1_str; - else - op += cast_type + mode + "(" + input0_str + ", " + input1_str + ")"; - // input_0 == int && input_1 != int - } else { - op += cast_type + "f" + mode + "(convert_float(" + input0_str + "), " + input1_str + ")"; - } - // input_0 != int && input_1 == int - } else if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::UINT8) { - op += cast_type + "f" + mode + "(" + input0_str + ", convert_float(" + input1_str + "))"; - // input_0 != int && input_1 != int - } else { - op += cast_type + "f" + mode + "(" + input0_str + ", " + input1_str + ")"; - } - } break; - case EltwiseMode::POW: - op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; - break; - case EltwiseMode::SQRT: - op += cast_type + "sqrt(" + input0_str + ")"; - break; - case EltwiseMode::RSQRT: - op += cast_type + "1/sqrt(" + input0_str + ")"; - break; - case EltwiseMode::ASSIGN: - op += input0_str; - break; - default: - break; - } - - std::string opname = "OPERATION" + op_num_str; - jit.AddConstant(MakeJitConstant(opname, op)); - do_eltwise += "\\\n\t" + opname + ";"; - } - - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) - do_eltwise += "\\\n\tinput" + std::to_string(updateInputs[update_input_idx].inputId) + "[GET_INDEX(INPUT, " + - std::to_string(updateInputs[update_input_idx].inputId) + ")] = tmp" + - std::to_string(updateInputs[update_input_idx].tmpId) + ";"; - - do_eltwise += "\\\n\tres = tmp" + std::to_string(operations.size() - 1) + ";"; - - jit.AddConstant(MakeJitConstant("DO_ELTWISE", do_eltwise)); - - if (params.layoutBased || params.int8_quantization) { - jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); - } - - if (!params.stride.empty()) { - jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); - } - - /////////////// - return jit; -} - -KernelsData EltwiseKernel_b_fs_yx_fsv4::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector diff --git 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h deleted file mode 100644 index 72d9d5a09cf..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv4.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "eltwise_kernel_base.h" - -namespace kernel_selector { -class EltwiseKernel_b_fs_yx_fsv4 : public EltwiseKernelBase { -public: - using Parent = EltwiseKernelBase; - EltwiseKernel_b_fs_yx_fsv4() : EltwiseKernelBase("eltwise_b_fs_yx_fsv4") {} - virtual ~EltwiseKernel_b_fs_yx_fsv4() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - JitConstants GetJitConstants(const eltwise_params& params) const override; - DispatchData SetDefault(const eltwise_params& params) const override; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp index e3b4b7ddf6c..dc702492587 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.cpp @@ -51,17 +51,6 @@ static uint32_t GetNumberOfInputs(EltwiseMode m) { ParamsKey eltwise_params::GetParamsKey() const { ParamsKey k = base_params::GetParamsKey(); - if (int8_quantization) { - k.EnableInt8Quantization(); - } - - if (output_calibration) { - k.EnableOutputCalibration(); - } - - if (inputs_calibration) { - k.EnableEltwiseInputsCalibration(); - } if (!stride.empty()) { k.EnableEltwiseStride(); @@ -617,9 +606,7 @@ KernelsData EltwiseKernelBase::GetCommonKernelsData(const Params& params, const kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); kernel.arguments = GetArgsDesc((uint32_t)newParams.inputs.size(), false, - false, - newParams.int8_quantization, - newParams.output_calibration); + false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h index 22d398d0b55..0e59efa68b1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_base.h @@ -84,14 +84,8 @@ 
struct eltwise_params : public base_params { bool layoutBased = false; bool int8_quantization = false; - bool output_calibration = false; - float output_quantization_factor = 1.0f; - bool inputs_calibration = false; bool broadcast = false; - MultiDataTensor output_calibration_factors; - MultiDataTensor inputs_calibration_factors; - std::vector<float> input_quantization_factors; virtual ParamsKey GetParamsKey() const; }; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp deleted file mode 100644 index 4e8ff935db3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.cpp +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "eltwise_kernel_fs_bs_yx_bsv4_fsv32.h" -#include "kernel_selector_utils.h" -#include -#include - -namespace kernel_selector { - -ParamsKey EltwiseKernel_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnableInt8Quantization(); - k.EnableEltwiseStride(); - return k; -} - -EltwiseKernelBase::DispatchData EltwiseKernel_fs_bs_yx_bsv4_fsv32::SetDefault(const eltwise_params& params) const { - DispatchData kd; - - kd.gws0 = params.output.X().v; - kd.gws1 = params.output.Y().v; - // we process 4 batches and 4 features per workitem - kd.gws2 = (params.output.Batch().v / 4) * (params.output.Feature().v / 4); - kd.lws0 = 1; - kd.lws1 = 1; - kd.lws2 = 8; - - kd.efficiency = FORCE_PRIORITY_3; - return kd; -} - -JitConstants EltwiseKernel_fs_bs_yx_bsv4_fsv32::GetJitConstants(const eltwise_params& params) const { - JitConstants jit = MakeBaseParamsJitConstants(params); - - const size_t in_x_pitch = 32 * 4; - const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded(); - const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded(); - const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4); - const size_t in_offset = - in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch)); - jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch)); - jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset)); - - /////////////// - jit.AddConstants({ - MakeJitConstant("ELTWISE_LAYOUT_BASED", params.layoutBased), - MakeJitConstant("QUANTIZATION_TERM", params.int8_quantization), - }); - - if (params.int8_quantization) { - if (params.output_calibration) { - jit.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.output_calibration)); - jit.AddConstant(MakeJitConstant("O_QF", params.output_calibration_factors[0])); - - } else { - jit.AddConstants({MakeJitConstant("O_QF", params.output_quantization_factor)}); - } - } - - std::string inputs_decls; - auto& updateInputs = params.updateInputIds; - - for (size_t i = 0; i < params.inputs.size(); i++) { - // const should be added only to inputs which will not be updated - std::string const_str = "const"; - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) { - if (updateInputs[update_input_idx].inputId == i) { - const_str = ""; - break; - } - } - - inputs_decls += - const_str + " __global " + toCLType(params.inputs[i].GetDType()) + "* input" + std::to_string(i) + ", "; - - if (!params.stride.empty()) { - jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_X", params.stride[i].x)); - jit.AddConstant(MakeJitConstant("INPUT" + std::to_string(i) + "_STRIDE_Y", params.stride[i].y)); - } - } - - jit.AddConstant(MakeJitConstant("INPUTS_DECLS", inputs_decls)); - jit.AddConstant(MakeJitConstant("ELTWISE_NO_PITCH_SAME_DIMS", CheckInputsOutputNoPitchSameDims(params))); - - std::string do_eltwise; - - auto& operations = params.operations; - auto& coefficients = params.coefficients; - - for (size_t op_num = 0; op_num < operations.size(); op_num++) { - const std::string op_num_str = std::to_string(op_num); - const auto& ew = operations[op_num]; - - for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - const std::string name = "INPUT_" + op_num_str + "_" + std::to_string(input_idx); - switch (input.mode) { - case EltwiseInputMode::SCALAR: - jit.AddConstant(MakeJitConstant(name, input.scalar)); - break; - case EltwiseInputMode::INPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, - "GET_INPUT(input" + std::to_string(input.index) + ", INPUT" + - std::to_string(input.index) + ")")); - break; - case EltwiseInputMode::OUTPUT_BUFFER: - jit.AddConstant(MakeJitConstant(name, "output[GET_INDEX(OUTPUT, )]")); - break; - case EltwiseInputMode::UNORDERED_ACCESS_INPUT_BUFFER: - jit.AddConstant(MakeJitConstant( - name, - "input" + std::to_string(input.index) + "[(size_t)tmp" + std::to_string(input.tmpIndex) + "]")); - break; - case EltwiseInputMode::INTERMEDIATE_RESULTS_INDEX: - jit.AddConstant(MakeJitConstant(name, "tmp" + std::to_string(input.tmpIndex))); - break; - default: - break; - } - } - std::string input0_str, input1_str, cast_type, op; - - if (params.int8_quantization) { - cast_type = "(int16)"; - op = "const int16 tmp" + op_num_str + " = "; - } else { - cast_type = "(UNIT_TYPE)"; - op = "const UNIT_TYPE tmp" + op_num_str + " = "; - } - - input0_str = cast_type + "INPUT_" + op_num_str + "_0"; - input1_str = cast_type + "INPUT_" + op_num_str + "_1"; - - if (ew.mode == EltwiseMode::ADD) { - std::vector<std::string> coeff_strings(ew.inputs.size(), ""); - for (size_t input_idx = 0; input_idx < ew.inputs.size(); input_idx++) { - const auto& input = ew.inputs[input_idx]; - if (input.mode == EltwiseInputMode::INPUT_BUFFER && input.index < coefficients.size()) { - const float c = coefficients[input.index]; - if (c != 1.0f) - coeff_strings[input_idx] = cast_type + "(" + std::to_string(c) + ")*"; - } - } - - input0_str = coeff_strings[0] + input0_str; - input1_str = coeff_strings[1] + input1_str; - } - - switch
(ew.mode) { - case EltwiseMode::ADD: - op += input0_str + " + " + input1_str; - break; - case EltwiseMode::SUB: - op += input0_str + " - " + input1_str; - break; - case EltwiseMode::MUL: - op += input0_str + " * " + input1_str; - break; - case EltwiseMode::DIV: - op += input0_str + " / " + input1_str; - break; - case EltwiseMode::MODULU: - case EltwiseMode::MIN: - case EltwiseMode::MAX: { - auto mode = (ew.mode == EltwiseMode::MODULU ? "mod" : (ew.mode == EltwiseMode::MIN ? "min" : "max")); - auto input_0_type = params.inputs[0].GetDType(); - auto input_1_type = params.inputs[1].GetDType(); - - // input_0 == int - if (input_0_type == kernel_selector::Datatype::INT8 || - input_0_type == kernel_selector::Datatype::INT32 || - input_0_type == kernel_selector::Datatype::INT64) { - // input_0 == int && input_1 == int - if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::INT32 || - input_1_type == kernel_selector::Datatype::INT64) { - if (ew.mode == EltwiseMode::MODULU) - op += input0_str + " % " + input1_str; - else - op += cast_type + mode + "(" + input0_str + ", " + input1_str + ")"; - // input_0 == int && input_1 != int - } else { - op += cast_type + "f" + mode + "(convert_float(" + input0_str + "), " + input1_str + ")"; - } - // input_0 != int && input_1 == int - } else if (input_1_type == kernel_selector::Datatype::INT8 || - input_1_type == kernel_selector::Datatype::INT32 || - input_1_type == kernel_selector::Datatype::INT64) { - op += cast_type + "f" + mode + "(" + input0_str + ", convert_float(" + input1_str + "))"; - // input_0 != int && input_1 != int - } else { - op += cast_type + "f" + mode + "(" + input0_str + ", " + input1_str + ")"; - } - } break; - case EltwiseMode::POW: - op += cast_type + "pow(" + input0_str + ", " + input1_str + ")"; - break; - case EltwiseMode::SQRT: - op += cast_type + "sqrt(" + input0_str + ")"; - break; - case EltwiseMode::RSQRT: - op += cast_type + "1/sqrt(" + input0_str + ")"; - break; - case EltwiseMode::SQUARED_DIFF: - op += cast_type + "((" + input0_str + " - " + input1_str + - ")" - " * (" + - input0_str + " - " + input1_str + "))"; - break; - case EltwiseMode::EQ: - op += cast_type + "(" + input0_str + " == " + input1_str + ")"; - break; - case EltwiseMode::NE: - op += cast_type + "(" + input0_str + " != " + input1_str + ")"; - break; - case EltwiseMode::LT: - op += cast_type + "(" + input0_str + " < " + input1_str + ")"; - break; - case EltwiseMode::LE: - op += cast_type + "(" + input0_str + " <= " + input1_str + ")"; - break; - case EltwiseMode::GT: - op += cast_type + "(" + input0_str + " > " + input1_str + ")"; - break; - case EltwiseMode::GE: - op += cast_type + "(" + input0_str + " >= " + input1_str + ")"; - break; - case EltwiseMode::LOGIC_AND: - op += cast_type + "(" + input0_str + " && " + input1_str + ")"; - break; - case EltwiseMode::LOGIC_OR: - op += cast_type + "(" + input0_str + " || " + input1_str + ")"; - break; - case EltwiseMode::LOGIC_XOR: - op += cast_type + "(!" + input0_str + " != !" 
+ input1_str + ")"; - break; - case EltwiseMode::FLOOR_MOD: - op += cast_type + "(" + input0_str + " - " + input0_str + " / " + input1_str + " * " + input1_str + ")"; - break; - case EltwiseMode::ASSIGN: - op += input0_str; - break; - default: - break; - } - - std::string opname = "OPERATION" + op_num_str; - jit.AddConstant(MakeJitConstant(opname, op)); - do_eltwise += "\\\n\t" + opname + ";"; - } - - for (size_t update_input_idx = 0; update_input_idx < updateInputs.size(); update_input_idx++) - do_eltwise += "\\\n\tinput" + std::to_string(updateInputs[update_input_idx].inputId) + "[GET_INDEX(INPUT, " + - std::to_string(updateInputs[update_input_idx].inputId) + ")] = tmp" + - std::to_string(updateInputs[update_input_idx].tmpId) + ";"; - - do_eltwise += "\\\n\tres = tmp" + std::to_string(operations.size() - 1) + ";"; - - jit.AddConstant(MakeJitConstant("DO_ELTWISE", do_eltwise)); - - if (params.layoutBased || params.int8_quantization) { - jit.Merge(GetTensorFriendlyWorkGroupsJit(params.inputs[0])); - } - - if (!params.stride.empty()) { - jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1)); - } - - /////////////// - return jit; -} - -KernelsData EltwiseKernel_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h deleted file mode 100644 index d13407a1b7e..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_fs_bs_yx_bsv4_fsv32.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
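Both deleted eltwise kernels assemble their OpenCL body by concatenating one OPERATIONn macro per fused operation, exactly as in the GetJitConstants code above. A condensed sketch of that string assembly; EltwiseOp, Mode, and BuildDoEltwise are reduced stand-ins for the real descriptors, covering only a subset of modes:

    #include <string>
    #include <vector>

    enum class Mode { ADD, SUB, MUL };   // reduced subset of EltwiseMode

    struct EltwiseOp { Mode mode; };     // stand-in for the real descriptor

    // Emit "const int16 tmpN = (int16)INPUT_N_0 <op> (int16)INPUT_N_1;" per op,
    // then chain the statements into one DO_ELTWISE macro body.
    std::string BuildDoEltwise(const std::vector<EltwiseOp>& ops) {
        std::string do_eltwise;
        for (size_t n = 0; n < ops.size(); n++) {
            const std::string ns = std::to_string(n);
            const char* sym = ops[n].mode == Mode::ADD ? " + "
                            : ops[n].mode == Mode::SUB ? " - " : " * ";
            do_eltwise += "\\\n\tconst int16 tmp" + ns + " = (int16)INPUT_" + ns +
                          "_0" + sym + "(int16)INPUT_" + ns + "_1;";
        }
        // the result of the last fused operation becomes the kernel's output
        do_eltwise += "\\\n\tres = tmp" + std::to_string(ops.size() - 1) + ";";
        return do_eltwise;
    }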
- -#pragma once - -#include "eltwise_kernel_base.h" - -namespace kernel_selector { -class EltwiseKernel_fs_bs_yx_bsv4_fsv32 : public EltwiseKernelBase { -public: - EltwiseKernel_fs_bs_yx_bsv4_fsv32() : EltwiseKernelBase("eltwise_fs_bs_yx_bsv4_fsv32") {} - virtual ~EltwiseKernel_fs_bs_yx_bsv4_fsv32() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const eltwise_params& params) const override; - DispatchData SetDefault(const eltwise_params& params) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp index aeda60ba4ae..f8021ff16cf 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_ref.cpp @@ -38,7 +38,6 @@ ParamsKey EltwiseKernelRef::GetSupportedKey() const { k.EnableTensorOffset(); k.EnableTensorPitches(); k.EnableBatching(); - k.EnableInt8Quantization(); k.EnableEltwiseStride(); k.EnableEltwiseBroadcast(); return k; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp index 04d9ad9a9d0..697e6a847f1 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_selector.cpp @@ -16,8 +16,6 @@ #include "eltwise_kernel_selector.h" #include "eltwise_kernel_ref.h" #include "eltwise_kernel_vload8.h" -#include "eltwise_kernel_fs_bs_yx_bsv4_fsv32.h" -#include "eltwise_kernel_b_fs_yx_fsv4.h" #include "eltwise_kernel_fs_b_yx_fsv32.h" #include "eltwise_kernel_b_fs_yx_fsv16.h" #include "eltwise_kernel_mixed_byxf_and_fs_b_yx_fsv32.h" @@ -26,8 +24,6 @@ namespace kernel_selector { eltwise_kernel_selector::eltwise_kernel_selector() { Attach<EltwiseKernelRef>(); Attach<EltwiseKernel_vload8>(); - Attach<EltwiseKernel_fs_bs_yx_bsv4_fsv32>(); - Attach<EltwiseKernel_b_fs_yx_fsv4>(); Attach<EltwiseKernel_fs_b_yx_fsv32>(); Attach<EltwiseKernel_b_fs_yx_fsv16>(); Attach<EltwiseKernel_mixed_byxf_and_fs_b_yx_fsv32>(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp deleted file mode 100644 index 1b8e52e2dba..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
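One more note on the removed eltwise kernels before the embed files: the IN_*_PITCH constants emitted by the deleted fs_bs_yx_bsv4_fsv32 kernel follow directly from the blocked layout, where a block of 4 batches by 32 features is stored contiguously. A direct restatement of that pitch math; the struct and function names are illustrative, and x_padded, y_padded, and batch are assumed inputs:

    #include <cstddef>

    struct Bsv4Fsv32Pitches {            // illustrative holder for the four pitches
        size_t x, y, b_block, f_block;
    };

    // Mirrors the JIT-constant computation in the deleted kernel: one x-step
    // spans a whole 4x32 block, and higher pitches multiply up the padded dims.
    Bsv4Fsv32Pitches ComputePitches(size_t x_padded, size_t y_padded, size_t batch) {
        Bsv4Fsv32Pitches p;
        p.x = 32 * 4;                                // IN_X_PITCH
        p.y = p.x * x_padded;                        // IN_Y_PITCH
        p.b_block = p.y * y_padded;                  // IN_B_BLOCK_PITCH
        p.f_block = p.b_block * ((batch + 3) / 4);   // IN_F_BLOCK_PITCH
        return p;
    }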
-*/ - -#include "embed_kernel_ref.h" -#include "kernel_selector_utils.h" -#include "common_tools.h" -#include - -namespace kernel_selector { - -ParamsKey EmbedKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::F16); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableAllInputLayout(); - k.EnableOutputLayout(DataLayout::bf); - k.EnableBiasPerOutput(); - k.EnableBiasPerFeature(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - k.EnableNonBiasTerm(); - return k; -} - -JitConstants EmbedKernelRef::GetJitConstants(const embed_params& params) const { - JitConstants jit = WeightBiasKernelBase::GetJitConstants(params); - const auto& input = params.inputs[0]; - const auto x_size = input.LogicalSize() / input.Batch().v; - const auto w_size = params.weights.OFM().v; - jit.AddConstant(MakeJitConstant("INPUT0_ELEMENTS_COUNT", x_size)); - jit.AddConstant(MakeJitConstant("NUM_OUTPUT_SIZE", w_size)); - - return jit; -} - -EmbedKernelRef::DispatchData EmbedKernelRef::SetDefault(const embed_params& params) const { - DispatchData kd; - std::vector<size_t> global = {params.inputs[0].X().v, params.weights.OFM().v, params.inputs[0].Batch().v}; - std::vector<size_t> local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo); - - kd.gws0 = global[0]; - kd.gws1 = global[1]; - kd.gws2 = global[2]; - - kd.lws0 = local[0]; - kd.lws1 = local[1]; - kd.lws2 = 1; - return kd; -} - -KernelsData EmbedKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - assert(params.GetType() == KernelType::EMBED); - - const embed_params& orgParams = static_cast<const embed_params&>(params); - - DispatchData runInfo = SetDefault(orgParams); - KernelData kd = KernelData::Default<embed_params>(params); - embed_params& newParams = *static_cast<embed_params*>(kd.params.get()); - - bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oiyx, kd.weightsReorderParams); - - if (!succeed) { - return {}; - } - - auto cldnn_jit = GetJitConstants(newParams); - auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - - FillCLKernelData(kernel, - runInfo, - params.engineInfo, - kernelName, - jit, - entry_point, - DEFAULT, - true, - !newParams.bias.empty()); - - kd.estimatedTime = runInfo.efficiency; - - return {kd}; -} - -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h deleted file mode 100644 index ccedf630559..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_ref.h +++ /dev/null @@ -1,42 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "weight_bias_kernel_base.h" -#include "embed_params.h" -#include "common_kernel_base.h" - -namespace kernel_selector { - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// EmbedKernelRef -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class EmbedKernelRef : public WeightBiasKernelBase { -public: - EmbedKernelRef() : WeightBiasKernelBase("embed_ref") {} - virtual ~EmbedKernelRef() {} - - struct DispatchData : public CommonDispatchData {}; - - ParamsKey GetSupportedKey() const override; - -protected: - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - virtual JitConstants GetJitConstants(const embed_params& params) const; - virtual DispatchData SetDefault(const embed_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp deleted file mode 100644 index d9de5af8951..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "embed_kernel_selector.h" -#include "embed_kernel_ref.h" - -namespace kernel_selector { - -embed_kernel_selector::embed_kernel_selector() { Attach<EmbedKernelRef>(); } - -KernelsData embed_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const { - return GetNaiveBestKernel(params, options, KernelType::EMBED); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h deleted file mode 100644 index 1e2db97263c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_kernel_selector.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class embed_kernel_selector : public kernel_selector_base {
-public:
-    static embed_kernel_selector& Instance() {
-        static embed_kernel_selector instance_;
-        return instance_;
-    }
-
-    embed_kernel_selector();
-
-    virtual ~embed_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h
deleted file mode 100644
index 94826c29858..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/embed/embed_params.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "weight_bias_params.h"
-#include <sstream>
-
-namespace kernel_selector {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// embed_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct embed_params : public weight_bias_params {
-    embed_params() : weight_bias_params(KernelType::EMBED) {}
-
-    std::string to_string() const {
-        std::stringstream s;
-
-        s << base_params::to_string() << "_";
-        if (bias.empty()) {
-            s << "no_bias"
-              << "_";
-        } else {
-            s << "bias_" << bias[0].PhysicalSize() << "_";
-        }
-        return s.str();
-    }
-    virtual ParamsKey GetParamsKey() const { return weight_bias_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// embed_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct embed_optional_params : weight_bias_optional_params {
-    embed_optional_params() : weight_bias_optional_params(KernelType::EMBED) {}
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp
index 56d1ad1f1a3..9617e458cca 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_base.cpp
@@ -29,8 +29,6 @@ JitConstants FullyConnectedKernelBase::GetJitConstants(const fully_connected_par
 
     jit.AddConstant(MakeJitConstant("INPUT0_ELEMENTS_COUNT", x_size));
 
-    jit.AddConstant(MakeJitConstant("QUANTIZATION_TERM", params.quantization != QuantizationType::NONE));
-
     return jit;
 }
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp
deleted file mode 100644
index 035fc421d93..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-// Copyright (c) 2016-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_kernel_mmad_batched.h"
-
-namespace kernel_selector {
-ParamsKey FullyConnected_mmad_batched::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::bf);
-    k.EnableBiasPerOutput();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    k.EnableInt8Quantization();
-    k.EnableOutputCalibration();
-    return k;
-}
-
-bool FullyConnected_mmad_batched::Validate(const Params& p, const optional_params& o) const {
-    if (!FullyConnectedKernelBase::Validate(p, o)) {
-        return false;
-    }
-
-    const auto& params = static_cast<const fully_connected_params&>(p);
-
-    // we do not support padded input
-    if (params.inputs[0].X().pad.Total() != 0 || params.inputs[0].Y().pad.Total() != 0)
-        return false;
-
-    size_t batch = params.inputs[0].Batch().v;
-    // batch must be a multiple of 8
-    if (batch % 8 != 0) {
-        return false;
-    }
-
-    return true;
-}
-
-JitConstants FullyConnected_mmad_batched::GetJitConstants(const fully_connected_params& params,
-                                                          const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1));
-
-    // pitch for special block format used in this kernel
-    const size_t ifm_32_aligned = Align(params.weights.IFM().v, 32);
-    const size_t filter_ofm_block_pitch =
-        (ifm_32_aligned / 32) * params.weights.X().v * params.weights.Y().v * 4 * 8 * 8;
-    jit.AddConstant(MakeJitConstant("FILTER_OFM_BLOCK_PITCH", filter_ofm_block_pitch));
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-
-    return jit;
-}
-
-FullyConnected_mmad_batched::DispatchData FullyConnected_mmad_batched::SetDefault(const fully_connected_params& params,
-                                                                                  int) const {
-    auto runInfo = Parent::SetDefault(params);
-
-    constexpr size_t sub_group_size = 8;
-
-    const auto of_maps = params.output.Feature().v;
-    const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
-
-    runInfo.gws0 = params.output.Batch().v / 8;  // we process 8 batches in a single WG
-    runInfo.gws1 = of_threads_per_batch;
-    runInfo.gws2 = 1;
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = sub_group_size;
-    runInfo.lws2 = 1;
-
-    runInfo.efficiency = FORCE_PRIORITY_1;
-    return runInfo;
-}
-
-KernelsData FullyConnected_mmad_batched::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelsData res = {};
-    for (size_t i = 0; i < autoTuneOptions.size(); i++) {
-        KernelsData kd = GetTunedKernelsDataByIndex(params,
-                                                    options,
-                                                    DataLayout::fs_bs_yx_bsv4_fsv32,
-                                                    WeightsLayout::os_is_yx_isa8_osv8_isv4,
-                                                    FORCE_PRIORITY_1,
-                                                    static_cast<int>(i));
-        if (!kd.empty()) {
-            res.emplace_back(kd[0]);
-        }
-    }
-    return res;
-}
-} // namespace kernel_selector
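
Note on the removed kernel above: FILTER_OFM_BLOCK_PITCH and the IN_*_PITCH constants encode the fs_bs_yx_bsv4_fsv32 packing, with features tiled by 32 and batches by 4. A minimal standalone sketch of that pitch arithmetic, using hypothetical sizes (64 IFM, 3x3 filter, 56x56 input, batch 8); the numbers are illustrative only and are not part of the removed code:

#include <cassert>
#include <cstddef>

// Round x up to a multiple of m (the role Align() plays above).
static size_t align_up(size_t x, size_t m) { return ((x + m - 1) / m) * m; }

int main() {
    const size_t ifm = 64, filter_x = 3, filter_y = 3;            // hypothetical filter
    const size_t ifm_32_aligned = align_up(ifm, 32);              // 64
    // Each OFM block stores 4*8*8 = 256 weights per 32-IFM slice.
    const size_t filter_ofm_block_pitch =
        (ifm_32_aligned / 32) * filter_x * filter_y * 4 * 8 * 8;  // 2 * 9 * 256 = 4608
    assert(filter_ofm_block_pitch == 4608);

    const size_t in_x = 56, in_y = 56, batch = 8;                 // hypothetical input
    const size_t in_x_pitch = 32 * 4;                             // 32 features * 4 batches
    const size_t in_y_pitch = in_x_pitch * in_x;                  // 128 * 56 = 7168
    const size_t in_b_block_pitch = in_y_pitch * in_y;            // one 4-batch block
    const size_t in_f_block_pitch = in_b_block_pitch * ((batch + 3) / 4);  // 2 blocks
    assert(in_f_block_pitch == in_b_block_pitch * 2);
    return 0;
}
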
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h
deleted file mode 100644
index 07feee159a0..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_mmad_batched.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2016 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fully_connected_kernel_base.h"
-
-namespace kernel_selector {
-
-class FullyConnected_mmad_batched : public FullyConnectedKernelBase {
-public:
-    using Parent = FullyConnectedKernelBase;
-
-    FullyConnected_mmad_batched() : Parent("fully_connected_gpu_mmad_batched") {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    bool Validate(const Params& p, const optional_params& o) const override;
-    JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
-    DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
index 3135d00e592..fc7d28aedcf 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_selector.cpp
@@ -28,7 +28,6 @@
 #include "fully_connected_kernel_fb_io_block.h"
 #include "fully_connected_kernel_bf_io_input_spatial.h"
 #include "fully_connected_kernel_mmad.h"
-#include "fully_connected_kernel_mmad_batched.h"
 #include "fully_connected_kernel_imad.h"
 #include "fully_connected_kernel_fs_byx_fsv32.h"
 
@@ -49,7 +48,6 @@ fully_connected_kernel_selector::fully_connected_kernel_selector() {
     Attach();
     Attach();
     Attach();
-    // Attach<FullyConnected_mmad_batched>();
     Attach();
     Attach();
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp
deleted file mode 100644
index a5eb45ed347..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_input_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include <algorithm>
-#include <vector>
-
-namespace kernel_selector {
-JitConstants FullyConnectedGradInputKernelBase::GetJitConstants(const fully_connected_grad_input_params& params) const {
-    return WeightBiasKernelBase::GetJitConstants(params);
-}
-
-FullyConnectedGradInputKernelBase::DispatchData FullyConnectedGradInputKernelBase::SetDefault(
-    const fully_connected_grad_input_params& params) const {
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-    size_t gws0 = params.output.Batch().v * params.weights.IFM().v;
-    size_t lws0 = std::min(gws0, static_cast<size_t>(32));
-    while (gws0 % lws0) {
-        lws0--;
-    }
-    kd.gws0 = gws0;
-    kd.gws1 = params.weights.X().v;
-    kd.gws2 = params.weights.Y().v;
-    kd.lws0 = lws0;
-    kd.lws1 = 1;
-    kd.lws2 = 1;
-    kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-    return kd;
-}
-
-KernelsData FullyConnectedGradInputKernelBase::GetKernelsData(const Params& params,
-                                                              const optional_params& options) const {
-    assert(params.GetType() == KernelType::FULLY_CONNECTED_GRAD_INPUT);
-
-    const fully_connected_grad_input_params& orgParams = static_cast<const fully_connected_grad_input_params&>(params);
-
-    DispatchData runInfo = SetDefault(orgParams);
-    KernelData kd = KernelData::Default(params);
-    fully_connected_grad_input_params& newParams = *static_cast<fully_connected_grad_input_params*>(kd.params.get());
-
-    bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oi, kd.weightsReorderParams);
-
-    if (!succeed) {
-        return {};
-    }
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel,
-                     runInfo,
-                     params.engineInfo,
-                     kernelName,
-                     jit,
-                     entry_point,
-                     DEFAULT,
-                     true,
-                     !orgParams.bias.empty());
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
-
-    kd.estimatedTime = runInfo.efficiency;
-
-    return {kd};
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h
deleted file mode 100644
index 29ada244e3a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_base.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "weight_bias_kernel_base.h"
-#include "kernel_selector_params.h"
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_input_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_input_params : public weight_bias_params {
-    fully_connected_grad_input_params() : weight_bias_params(KernelType::FULLY_CONNECTED_GRAD_INPUT) {}
-
-    virtual ParamsKey GetParamsKey() const { return weight_bias_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_input_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_input_optional_params : weight_bias_optional_params {
-    fully_connected_grad_input_optional_params()
-        : weight_bias_optional_params(KernelType::FULLY_CONNECTED_GRAD_INPUT) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// FullyConnectedGradInputKernelBase
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class FullyConnectedGradInputKernelBase : public WeightBiasKernelBase {
-public:
-    using WeightBiasKernelBase::WeightBiasKernelBase;
-    virtual ~FullyConnectedGradInputKernelBase() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const;
-    virtual JitConstants GetJitConstants(const fully_connected_grad_input_params& params) const;
-    virtual DispatchData SetDefault(const fully_connected_grad_input_params& params) const;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp
deleted file mode 100644
index 4eeab782044..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_input_kernel_ref.h"
-
-namespace kernel_selector {
-
-ParamsKey FullyConnectedGradInputKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputWeightsType(WeightsType::F16);
-    k.EnableInputWeightsType(WeightsType::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::yxfb);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::byxf);
-    k.EnableOutputLayout(DataLayout::yxfb);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::byxf);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    // TODO: add support to batching, figure out the way to update weights/biases for multiple batches at the same time
-    k.EnableBatching();
-    k.EnableGradient();
-    k.DisableTuning();
-    return k;
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h
deleted file mode 100644
index 4ccab494f70..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_ref.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fully_connected_grad_input_kernel_base.h"
-
-namespace kernel_selector {
-
-class FullyConnectedGradInputKernelRef : public FullyConnectedGradInputKernelBase {
-public:
-    FullyConnectedGradInputKernelRef() : FullyConnectedGradInputKernelBase("fully_connected_grad_input_gpu_ref") {}
-    virtual ~FullyConnectedGradInputKernelRef() {}
-
-    ParamsKey GetSupportedKey() const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp
deleted file mode 100644
index a7df113d338..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_input_kernel_selector.h"
-#include "fully_connected_grad_input_kernel_ref.h"
-
-namespace kernel_selector {
-fully_connected_grad_input_kernel_selector::fully_connected_grad_input_kernel_selector() {
-    Attach<FullyConnectedGradInputKernelRef>();
-}
-
-KernelsData fully_connected_grad_input_kernel_selector::GetBestKernels(const Params& params,
-                                                                       const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::FULLY_CONNECTED_GRAD_INPUT);
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h
deleted file mode 100644
index b2d165d1b4e..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_input/fully_connected_grad_input_kernel_selector.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class fully_connected_grad_input_kernel_selector : public kernel_selector_base {
-public:
-    static fully_connected_grad_input_kernel_selector& Instance() {
-        static fully_connected_grad_input_kernel_selector instance_;
-        return instance_;
-    }
-
-    fully_connected_grad_input_kernel_selector();
-
-    virtual ~fully_connected_grad_input_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp
deleted file mode 100644
index a5e4cdae69a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_weights_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include <algorithm>
-#include <vector>
-
-namespace kernel_selector {
-JitConstants FullyConnectedGradWeightsKernelBase::GetJitConstants(
-    const fully_connected_grad_weights_params& params) const {
-    JitConstants jit = training_kernel_base::GetJitConstants(params);
-
-    return jit;
-}
-
-FullyConnectedGradWeightsKernelBase::DispatchData FullyConnectedGradWeightsKernelBase::SetDefault(
-    const fully_connected_grad_weights_params& params) const {
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-    size_t gws0 = params.weights.OFM().v * params.weights.IFM().v;
-    size_t lws0 = std::min(gws0, static_cast<size_t>(32));
-    while (gws0 % lws0) {
-        lws0--;
-    }
-    kd.gws0 = gws0;
-    kd.gws1 = params.weights.X().v;
-    kd.gws2 = params.weights.Y().v;
-    kd.lws0 = lws0;
-    kd.lws1 = 1;
-    kd.lws2 = 1;
-    kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-    return kd;
-}
-
-KernelsData FullyConnectedGradWeightsKernelBase::GetKernelsData(const Params& params,
-                                                                const optional_params& options) const {
-    assert(params.GetType() == KernelType::FULLY_CONNECTED_GRAD_WEIGHTS);
-
-    const fully_connected_grad_weights_params& orgParams =
-        static_cast<const fully_connected_grad_weights_params&>(params);
-
-    DispatchData runInfo = SetDefault(orgParams);
-    KernelData kd = KernelData::Default(params);
-    fully_connected_grad_weights_params& newParams =
-        *static_cast<fully_connected_grad_weights_params*>(kd.params.get());
-
-    bool succeed = UpdateWeightsParams(newParams, options, WeightsLayout::oi, kd.weightsReorderParams);
-
-    if (!succeed) {
-        return {};
-    }
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel,
-                     runInfo,
-                     params.engineInfo,
-                     kernelName,
-                     jit,
-                     entry_point,
-                     DEFAULT,
-                     true,
-                     !orgParams.bias.empty());
-    if (orgParams.use_momentum) {
-        kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0});
-        if (!orgParams.bias.empty())
-            kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_BIAS_GRADIENT, 0});
-    }
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
-    kernel.arguments.push_back({ArgumentDescriptor::Types::LEARNING_RATE, 0});
-
-    kd.estimatedTime = runInfo.efficiency;
-
-    return {kd};
-}
-} // namespace kernel_selector
\ No newline at end of file
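
Both removed training kernels (grad_input earlier and grad_weights above) derive the local work size the same way: start from min(gws0, 32) and decrement until the value divides gws0, which yields the largest divisor of gws0 that does not exceed 32. A self-contained sketch of that rule; the sample sizes are hypothetical:

#include <algorithm>
#include <cassert>
#include <cstddef>

// Largest local size <= 32 that evenly divides the global size, mirroring
// the lws0 loop in both SetDefault() implementations above.
static size_t pick_lws0(size_t gws0) {
    size_t lws0 = std::min(gws0, static_cast<size_t>(32));
    while (gws0 % lws0) {
        lws0--;
    }
    return lws0;
}

int main() {
    assert(pick_lws0(96) == 32);  // 96 = 3 * 32, so 32 itself divides
    assert(pick_lws0(50) == 25);  // divisors of 50 up to 32: 25 is the largest
    assert(pick_lws0(7) == 7);    // gws0 below 32 starts (and stays) at gws0
    return 0;
}
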
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h
deleted file mode 100644
index 38115d73fcc..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_base.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "training_kernel_base.h"
-#include "kernel_selector_params.h"
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_weights_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_weights_params : public training_params {
-    fully_connected_grad_weights_params() : training_params(KernelType::FULLY_CONNECTED_GRAD_WEIGHTS) {}
-
-    virtual ParamsKey GetParamsKey() const {
-        ParamsKey k = training_params::GetParamsKey();
-
-        return k;
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fully_connected_grad_weights_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fully_connected_grad_weights_optional_params : training_optional_params {
-    fully_connected_grad_weights_optional_params()
-        : training_optional_params(KernelType::FULLY_CONNECTED_GRAD_WEIGHTS) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// FullyConnectedGradWeightsKernelBase
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class FullyConnectedGradWeightsKernelBase : public training_kernel_base {
-public:
-    using training_kernel_base::training_kernel_base;
-    virtual ~FullyConnectedGradWeightsKernelBase() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const;
-    virtual JitConstants GetJitConstants(const fully_connected_grad_weights_params& params) const;
-    virtual DispatchData SetDefault(const fully_connected_grad_weights_params& params) const;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp
deleted file mode 100644
index ef14d53e6a3..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_weights_kernel_ref.h"
-
-namespace kernel_selector {
-
-ParamsKey FullyConnectedGradWeightsKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputWeightsType(WeightsType::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::yxfb);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::byxf);
-    k.EnableOutputLayout(DataLayout::yxfb);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::byxf);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    k.EnableMomentum();
-    k.EnableBatching();
-    k.EnableGradient();
-    k.DisableTuning();
-    return k;
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h
deleted file mode 100644
index 196d07d6579..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_ref.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fully_connected_grad_weights_kernel_base.h"
-
-namespace kernel_selector {
-
-class FullyConnectedGradWeightsKernelRef : public FullyConnectedGradWeightsKernelBase {
-public:
-    FullyConnectedGradWeightsKernelRef()
-        : FullyConnectedGradWeightsKernelBase("fully_connected_grad_weights_gpu_ref") {}
-    virtual ~FullyConnectedGradWeightsKernelRef() {}
-
-    ParamsKey GetSupportedKey() const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp
deleted file mode 100644
index 0887084a7bb..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fully_connected_grad_weights_kernel_selector.h"
-#include "fully_connected_grad_weights_kernel_ref.h"
-
-namespace kernel_selector {
-fully_connected_grad_weights_kernel_selector::fully_connected_grad_weights_kernel_selector() {
-    Attach<FullyConnectedGradWeightsKernelRef>();
-}
-
-KernelsData fully_connected_grad_weights_kernel_selector::GetBestKernels(const Params& params,
-                                                                         const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::FULLY_CONNECTED_GRAD_WEIGHTS);
-}
-} // namespace kernel_selector
\ No newline at end of file
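
Every *_kernel_selector removed in this patch has the same shape: a Meyers singleton whose constructor registers implementations through Attach<T>(), with GetBestKernels() dispatching on the KernelType. A skeletal illustration with simplified stand-in types (this is not the real kernel_selector API):

#include <cstdio>
#include <memory>
#include <vector>

struct toy_kernel {
    virtual ~toy_kernel() = default;
    virtual const char* name() const = 0;
};

struct toy_ref_kernel : toy_kernel {
    const char* name() const override { return "toy_ref"; }
};

class toy_selector {
public:
    static toy_selector& Instance() {
        static toy_selector instance_;  // same Meyers-singleton shape as above
        return instance_;
    }
    const toy_kernel* best() const {
        return kernels_.empty() ? nullptr : kernels_.front().get();
    }

private:
    toy_selector() { Attach<toy_ref_kernel>(); }  // constructor registers kernels
    template <typename T>
    void Attach() { kernels_.emplace_back(new T()); }
    std::vector<std::unique_ptr<toy_kernel>> kernels_;
};

int main() {
    std::printf("%s\n", toy_selector::Instance().best()->name());  // prints "toy_ref"
    return 0;
}
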
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h
deleted file mode 100644
index 680b2229313..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected_grad_weights/fully_connected_grad_weights_kernel_selector.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class fully_connected_grad_weights_kernel_selector : public kernel_selector_base {
-public:
-    static fully_connected_grad_weights_kernel_selector& Instance() {
-        static fully_connected_grad_weights_kernel_selector instance_;
-        return instance_;
-    }
-
-    fully_connected_grad_weights_kernel_selector();
-
-    virtual ~fully_connected_grad_weights_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp
deleted file mode 100644
index aea2eb96edb..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "fused_conv_bn_scale_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include "common_tools.h"
-#include <algorithm>
-#include <vector>
-
-namespace kernel_selector {
-bool fused_conv_bn_scale_kernel_base::Validate(const Params& p, const optional_params& o) const {
-    if (p.GetType() != KernelType::FUSED_CONV_BN_SCALE || o.GetType() != KernelType::FUSED_CONV_BN_SCALE) {
-        return false;
-    }
-
-    const fused_conv_bn_scale_params& params = static_cast<const fused_conv_bn_scale_params&>(p);
-    const fused_conv_bn_scale_optional_params& optParams = static_cast<const fused_conv_bn_scale_optional_params&>(o);
-
-    bool bSupportedWeightsLayout = params.weights.GetLayout() == GetPreferredWeightsLayout(params);
-
-    const bool bWeightsOK = bSupportedWeightsLayout || optParams.allowStaticInputReordering;
-
-    return bWeightsOK;
-}
-
-JitConstants fused_conv_bn_scale_kernel_base::GetJitConstants(const fused_conv_bn_scale_params& params,
-                                                              const DispatchData&) const {
-    JitConstants mem_consts = WeightBiasKernelBase::GetJitConstants(params);
-    const auto& padding = params.padding;
-    const auto& input = params.inputs[0];
-
-    int64_t input_offset_with_padding =
-        (int64_t)input.GetFirstElementOffset() - padding.x * input.X().pitch - input.Y().pitch * padding.y;
-    input_offset_with_padding = std::max(input_offset_with_padding, (int64_t)0);
-
-    mem_consts.AddConstants({MakeJitConstant("STRIDE", params.stride),
-                             MakeJitConstant("PADDING", params.padding),
-                             MakeJitConstant("FILTER_ARRAY_NUM", params.split),
-                             MakeJitConstant("DILATION", params.dilation),
-                             MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding),
-                             MakeJitConstant("EPSILON", params.epsilon)});
-
-    if (params.fused_in_training)
-        mem_consts.AddConstant(MakeJitConstant("FUSED_TRAINING", 1));
-    if (params.scale_bias)
-        mem_consts.AddConstant(MakeJitConstant("SCALE_BIAS_TERM", 1));
-
-    return mem_consts;
-}
-
-bool fused_conv_bn_scale_kernel_base::CheckWorkGroups(const DispatchData& kd) {
-    if (kd.gws0 == 0 || kd.gws1 == 0 || kd.gws2 == 0 || kd.lws0 == 0 || kd.lws1 == 0 || kd.lws2 == 0) {
-        return false;
-    }
-
-    if ((kd.gws0 % kd.lws0) != 0 || (kd.gws1 % kd.lws1) != 0 || (kd.gws2 % kd.lws2) != 0) {
-        return false;
-    }
-
-    return true;
-}
-
-fused_conv_bn_scale_kernel_base::DispatchData fused_conv_bn_scale_kernel_base::SetDefault(
-    const fused_conv_bn_scale_params& params) const {
-    DispatchData kd;
-
-    const auto& out = params.output;
-    kd.fp16UnitUsed = out.GetDType() == Datatype::F16;
-    std::vector<size_t> global;
-    if (params.output.GetLayout() == DataLayout::bfyx || params.output.GetLayout() == DataLayout::byxf) {
-        global = {out.X().v, out.Y().v, out.Feature().v * out.Batch().v};
-    } else {
-        global = {out.Feature().v * out.Batch().v, out.X().v, out.Y().v};
-    }
-
-    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
-    kd.gws0 = global[0];
-    kd.gws1 = global[1];
-    kd.gws2 = global[2];
-
-    kd.lws0 = local[0];
-    kd.lws1 = local[1];
-    kd.lws2 = local[2];
-
-    kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-    return kd;
-}
-
-KernelsData fused_conv_bn_scale_kernel_base::GetCommonKernelsData(const Params& params,
-                                                                  const optional_params& options,
-                                                                  float estimated_time) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    KernelData kd = KernelData::Default(params);
-    fused_conv_bn_scale_params& newParams = *static_cast<fused_conv_bn_scale_params*>(kd.params.get());
-
-    DispatchData runInfo = SetDefault(newParams);
-
-    if (!CheckWorkGroups(runInfo)) {
-        // Internal Error - wrong calculation of global/local work group sizes
-        return {};
-    }
-
-    bool succeed =
-        UpdateWeightsParams(newParams, options, GetPreferredWeightsLayout(newParams), kd.weightsReorderParams);
-
-    if (!succeed) {
-        return {};
-    }
-
-    auto finalKernelName = GetKernelName(newParams);
-    auto cldnnJit = GetJitConstants(newParams, runInfo);
-    auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
-    auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel,
-                     runInfo,
-                     params.engineInfo,
-                     finalKernelName,
-                     jit,
-                     entryPoint,
-                     "",
-                     true,
-                     !newParams.bias.empty(),
-                     1);
-    kernel.arguments.push_back({ArgumentDescriptor::Types::SPLIT, 0});
-    uint32_t idx = 1;
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-    if (newParams.scale_bias)
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-    if (newParams.fused_in_training) {
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx++});
-        kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, idx});
-    }
-
-    kd.estimatedTime = estimated_time;
-
-    return {kd};
-}
-} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h
deleted file mode 100644
index 6abddd9f09f..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_base.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "weight_bias_kernel_base.h"
-#include "actual_kernels/convolution/convolution_params.h"
-#include <string>
-#include <vector>
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fused_conv_bn_scale_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fused_conv_bn_scale_params : public weight_bias_params {
-    fused_conv_bn_scale_params() : weight_bias_params(KernelType::FUSED_CONV_BN_SCALE) {}
-
-    uSize filterSize;
-    uSize stride;
-    uSize dilation;
-    uSize padding;
-    uint32_t split = 1;
-    bool fused_in_training = false;
-    bool scale_bias = false;
-    float epsilon = 0.00001f;
-
-    ParamsKey GetParamsKey() const override {
-        ParamsKey k = weight_bias_params::GetParamsKey();
-
-        if (split > 1) {
-            k.EnableSplitSupport();
-        }
-
-        return k;
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fused_conv_bn_scale_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct fused_conv_bn_scale_optional_params : weight_bias_optional_params {
-    fused_conv_bn_scale_optional_params() : weight_bias_optional_params(KernelType::FUSED_CONV_BN_SCALE) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// fused_conv_bn_scale_kernel_base
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class fused_conv_bn_scale_kernel_base : public WeightBiasKernelBase {
-public:
-    using WeightBiasKernelBase::WeightBiasKernelBase;
-    virtual ~fused_conv_bn_scale_kernel_base() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    virtual WeightsLayout GetPreferredWeightsLayout(const fused_conv_bn_scale_params &) const = 0;
-    virtual std::string GetKernelName(const fused_conv_bn_scale_params&) const { return kernelName; }
-    bool Validate(const Params& p, const optional_params& o) const override;
-    virtual JitConstants GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData& kd) const;
-    virtual DispatchData SetDefault(const fused_conv_bn_scale_params& params) const;
-    static bool CheckWorkGroups(const DispatchData&);
-    KernelsData GetCommonKernelsData(const Params& params, const optional_params& options, float estimated_time) const;
-};
-} // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp
deleted file mode 100644
index ebd7e52624c..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fused_conv_bn_scale_kernel_ref.h"
-#include "kernel_selector_utils.h"
-#include <algorithm>
-
-namespace kernel_selector {
-
-ParamsKey fused_conv_bn_scale_kernel_ref::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputWeightsType(WeightsType::F32);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBiasPerFeature();
-    k.EnableNonBiasTerm();
-    k.EnableSplitSupport();
-    k.EnableBatching();
-    k.DisableTuning();
-    return k;
-}
-
-fused_conv_bn_scale_kernel_base::DispatchData fused_conv_bn_scale_kernel_ref::SetDefault(
-    const fused_conv_bn_scale_params& arg) const {
-    DispatchData runInfo = fused_conv_bn_scale_kernel_base::SetDefault(arg);
-
-    runInfo.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
-    runInfo.gws0 = arg.output.Batch().v;
-    runInfo.gws1 = arg.output.Feature().v;
-    runInfo.gws2 = 1;
-
-    runInfo.lws0 = std::min(std::max(runInfo.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
-    while (runInfo.gws0 % runInfo.lws0 != 0) {
-        --runInfo.lws0;
-    }
-    runInfo.lws1 = 1;
-    runInfo.lws2 = 1;
-
-    return runInfo;
-}
-
-JitConstants fused_conv_bn_scale_kernel_ref::GetJitConstants(const fused_conv_bn_scale_params& params,
-                                                             const DispatchData& runInfo) const {
-    auto jit = Parent::GetJitConstants(params, runInfo);
-
-    return jit;
-}
-
-KernelsData fused_conv_bn_scale_kernel_ref::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelsData kd = GetCommonKernelsData(params, options, DONT_USE_IF_HAVE_SOMETHING_ELSE);
-
-    return kd;
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h
deleted file mode 100644
index 9e8222fc71b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_ref.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "fused_conv_bn_scale_kernel_base.h"
-#include <string>
-
-namespace kernel_selector {
-
-class fused_conv_bn_scale_kernel_ref : public fused_conv_bn_scale_kernel_base {
-public:
-    using Parent = fused_conv_bn_scale_kernel_base;
-
-    fused_conv_bn_scale_kernel_ref() : fused_conv_bn_scale_kernel_base("fused_conv_bn_scale_kernel_ref") {}
-    virtual ~fused_conv_bn_scale_kernel_ref() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    WeightsLayout GetPreferredWeightsLayout(const fused_conv_bn_scale_params &) const override {
-        return WeightsLayout::oiyx;
-    }
-    DispatchData SetDefault(const fused_conv_bn_scale_params& arg) const override;
-    JitConstants GetJitConstants(const fused_conv_bn_scale_params& params, const DispatchData& kd) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp
deleted file mode 100644
index 04674987c7e..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "fused_conv_bn_scale_kernel_selector.h"
-#include "fused_conv_bn_scale_kernel_ref.h"
-
-namespace kernel_selector {
-fused_conv_bn_scale_kernel_selector::fused_conv_bn_scale_kernel_selector() { Attach<fused_conv_bn_scale_kernel_ref>(); }
-
-KernelsData fused_conv_bn_scale_kernel_selector::GetBestKernels(const Params& params,
-                                                                const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::FUSED_CONV_BN_SCALE);
-}
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h
deleted file mode 100644
index db78aaa79fe..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_bn_scale/fused_conv_bn_scale_kernel_selector.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class fused_conv_bn_scale_kernel_selector : public kernel_selector_base {
-public:
-    static fused_conv_bn_scale_kernel_selector& Instance() {
-        static fused_conv_bn_scale_kernel_selector instance_;
-        return instance_;
-    }
-
-    fused_conv_bn_scale_kernel_selector();
-
-    virtual ~fused_conv_bn_scale_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-} // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp
deleted file mode 100644
index 1ecf94d0916..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
-// Copyright (c) 2019-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "fused_conv_eltwise_kernel_af32_imad_1x1.h"
-
-static size_t GetTileLength(size_t out_xy, size_t out_f, size_t min_threads) {
-    for (int tile_len = 14; tile_len > 0; tile_len--) {
-        // Kernel writes 32 output features per HW thread
-        size_t threads = (out_xy / tile_len) * out_xy * out_f / 32;
-        // Choose largest valid tile with enough HW threads
-        if ((out_xy % tile_len == 0) && (threads >= min_threads)) {
-            return tile_len;
-        }
-    }
-    return out_xy % 8 ? (out_xy % 7 ? 1 : 7) : 8;
-}
1 : 7) : 8; -} - -namespace kernel_selector { - -ParamsKey fused_conv_eltwise_kernel_af32_imad_1x1::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::byxf_af32); - k.EnableOutputLayout(DataLayout::byxf_af32); - k.EnableDifferentTypes(); - k.EnableDifferentInputWeightsTypes(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableBiasPerOutput(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableSplitSupport(); - k.EnableDepthwiseSeparableOpt(); - k.EnableInt8Quantization(); - k.EnableOutputCalibration(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - k.EnableEltwiseStride(); - return k; -} - -bool fused_conv_eltwise_kernel_af32_imad_1x1::Validate(const Params& p, const optional_params& o) const { - if (!Parent::Validate(p, o)) { - return false; - } - - KernelData kd = KernelData::Default(p); - fused_conv_eltwise_params& newParams = *static_cast(kd.params.get()); - - if (newParams.conv.filterSize.x != 1 || newParams.conv.filterSize.y != 1) - return false; - - if (newParams.conv.padding.x != 0 || newParams.conv.padding.y != 0) - return false; - - if (newParams.output.Feature().v % 32 != 0) - return false; - - const auto& input = newParams.inputs[0]; - - // we do not support padded input - if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0) - return false; - - if (newParams.conv.split != 1) - return false; - - return true; -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_af32_imad_1x1::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = Parent::SetDefault(arg); - - // Sub-group size - constexpr size_t sub_group_size = 8; - - const auto of_maps = arg.output.Feature().v; - const size_t of_maps_per_batch = RoundUp(of_maps, 32); - const size_t of_maps_total = of_maps_per_batch * arg.output.Batch().v; - - // Need to have at least 4 HW threads per EU - const size_t tile_length = GetTileLength(arg.output.X().v, of_maps_total, arg.engineInfo.computeUnitsCount * 4); - runInfo.cldnnStyle.blockWidth = tile_length; - - runInfo.efficiency = FORCE_PRIORITY_1; - - runInfo.gws0 = arg.output.X().v * arg.output.Y().v / tile_length; - runInfo.gws1 = of_maps_total / 4; // TILE_DEPTH==4 - runInfo.gws2 = 1; - - runInfo.lws0 = 1; - runInfo.lws1 = sub_group_size; - runInfo.lws2 = 1; - - return runInfo; -} - -JitConstants fused_conv_eltwise_kernel_af32_imad_1x1::GetJitConstants(const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws1)); - - jit.AddConstant(MakeJitConstant("TILE_LENGTH", runInfo.cldnnStyle.blockWidth)); - jit.AddConstant(MakeJitConstant("TILE_DEPTH", 4)); - - if (params.non_conv_scale != 1.0f) - jit.AddConstant(MakeJitConstant("NON_CONV_SCALE", params.non_conv_scale)); - - jit.Merge(MakeActivationJitConstants(params.conv.activations, GetUnitType(params), "_CONV_TYPED", true)); - jit.Merge(MakeActivationJitConstants(params.activations, GetUnitType(params), "_ELTW_TYPED", true)); - jit.Merge(MakeTypeJitConstants(Datatype::F32, "float")); - - 
return jit; -} - -KernelsData fused_conv_eltwise_kernel_af32_imad_1x1::GetKernelsData(const Params& params, - const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h deleted file mode 100644 index aa0f954e884..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_af32_imad_1x1.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_af32_imad_1x1 : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_af32_imad_1x1() - : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_af32_imad_1x1") {} - virtual ~fused_conv_eltwise_kernel_af32_imad_1x1() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - bool Validate(const Params& p, const optional_params& o) const override; - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::os_is_osv32_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp index ac48606f673..515e2b26264 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp @@ -74,14 +74,6 @@ ParamsKey fused_conv_eltwise_params::GetParamsKey() const { k.EnableFusedConvEltwTranspose(); } - if (conv.int8_quantization) { - k.EnableFusedConvEltwInt8Quantization(); - } - - if (conv.output_calibration) { - k.EnableFusedConvEltwOutputCalibration(); - } - if (conv.local_convolution) { k.EnableFusedConvEltwLocalConvolution(); } @@ -133,22 +125,8 @@ JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_el MakeJitConstant("FILTER_ARRAY_NUM", params.conv.split), 
MakeJitConstant("INPUT0_OFFSET_WITH_PADDING", input_offset_with_padding), MakeJitConstant("DEPTHWISE_SEPARABLE_OPT", params.conv.depthwise_separable_opt), - MakeJitConstant("QUANTIZATION_TERM", params.conv.int8_quantization), }); - if (params.conv.int8_quantization) { - mem_consts.AddConstants({MakeJitConstant("W_QF", params.conv.weights_quantization_factors[0])}); - mem_consts.AddConstants({MakeJitConstant("I_QF", params.conv.input_quantization_factor)}); - - if (params.conv.output_calibration) { - mem_consts.AddConstant(MakeJitConstant("CALIBRATION_TERM", params.conv.output_calibration)); - mem_consts.AddConstant(MakeJitConstant("O_QF", params.conv.output_calibration_factors[0])); - - } else { - mem_consts.AddConstants({MakeJitConstant("O_QF", params.conv.output_quantization_factor)}); - } - } - if (params.conv.local_convolution) { mem_consts.AddConstants({MakeJitConstant("LOCAL_CONVOLUTION", params.conv.local_convolution)}); } @@ -157,7 +135,6 @@ JitConstants fused_conv_eltwise_kernel_base::GetJitConstants(const fused_conv_el mem_consts.Merge(eltw_activations); JitConstants conv_activations = MakeActivationJitConstants(params.conv.activations, GetUnitType(params), "_CONV"); mem_consts.Merge(conv_activations); - mem_consts.AddConstant(MakeJitConstant("ELTW_CALIBRATION_TERM", params.eltw.output_calibration)); if (!params.eltw.stride.empty()) { mem_consts.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x)); @@ -332,8 +309,6 @@ KernelsData fused_conv_eltwise_kernel_base::GetCommonKernelsData(const Params& p } else { kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); } - if (!newParams.eltw.output_calibration_factors.empty()) - kernel.arguments.push_back({ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 1}); kd.estimatedTime = runInfo.efficiency; kd.autoTuneIndex = autoTuneIndex; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h index 43d3c814277..4d1d1aa9856 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h @@ -37,13 +37,7 @@ struct fused_conv_eltwise_params : public weight_bias_params { uint32_t split = 1; bool depthwise_separable_opt = false; bool transposed = false; - bool int8_quantization = false; - bool output_calibration = false; bool local_convolution = false; - float input_quantization_factor = 1.0f; - float output_quantization_factor = 1.0f; - MultiDataTensor weights_quantization_factors; - MultiDataTensor output_calibration_factors; std::vector activations; } conv; @@ -55,14 +49,8 @@ struct fused_conv_eltwise_params : public weight_bias_params { std::vector stride; bool layoutBased = false; - bool int8_quantization = false; - bool output_calibration = false; - float output_quantization_factor = 1.0f; - - MultiDataTensor output_calibration_factors; } eltw; - float non_conv_scale = 1.0f; bool second_input_in_output = false; bool depth_to_space_already_fused = false; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp 
deleted file mode 100644 index 837430da70d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "fused_conv_eltwise_kernel_gemm.h" -#include "kernel_selector_utils.h" -#include -#include - -namespace kernel_selector { - -ParamsKey fused_conv_eltwise_kernel_gemm::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F16); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableSubGroup(); - // k.EnableSubGroupShort(); // we need it for FP16 only. we check it on the Validate phase - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableFusedConvEltwSplitSupport(); - return k; -} - -std::string fused_conv_eltwise_kernel_gemm::GetKernelName(const fused_conv_eltwise_params& params) const { - if (params.inputs[0].GetDType() == Datatype::F32) { - return kernelName + "_fp32"; - } else { - return kernelName + "_fp16"; - } -} - -bool fused_conv_eltwise_kernel_gemm::Validate(const Params& p, const optional_params& o) const { - if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - return true; -} - -WeightsLayout fused_conv_eltwise_kernel_gemm::GetPreferreddWeightsLayout( - const fused_conv_eltwise_params ¶ms) const { - if (params.inputs[0].GetDType() == Datatype::F16) { - return WeightsLayout::iy_xs_os_xsv2_osv16__ao32; - } else { - return WeightsLayout::iy_xs_os_xsv2_osv8__ao32; - } -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_gemm::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = Parent::SetDefault(arg); - - runInfo.lws0 = 1; - runInfo.lws2 = 1; - - if (arg.inputs[0].GetDType() == Datatype::F16) { - runInfo.gemmStyle = {1, arg.conv.filterSize.x, 32, 32, 1, 1}; - runInfo.lws1 = 16; - runInfo.efficiency = FORCE_PRIORITY_6; - } else { - runInfo.gemmStyle = {2, 
arg.conv.filterSize.x, 32, 32, 2, 1}; - runInfo.lws1 = 8; - runInfo.efficiency = FORCE_PRIORITY_8; - } - - size_t sgemm_m = RoundUp(arg.output.X().v * arg.output.Y().v, runInfo.gemmStyle.subBlockDimM); - size_t sgemm_n = RoundUp(arg.output.Feature().v, runInfo.gemmStyle.subBlockDimN); - - runInfo.gws0 = RoundUp(CeilDiv(sgemm_n, runInfo.gemmStyle.globalWorkSizeDX), runInfo.lws0); - runInfo.gws1 = RoundUp(CeilDiv(sgemm_m, runInfo.gemmStyle.globalWorkSizeDY), runInfo.lws1); - runInfo.gws2 = arg.output.Batch().v; - - return runInfo; -} - -JitConstants fused_conv_eltwise_kernel_gemm::GetJitConstants(const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstants({ - MakeJitConstant("ALIGNED_OFM", RoundUp(params.output.Feature().v, runInfo.gemmStyle.subBlockDimN)), - MakeJitConstant("DX", runInfo.gemmStyle.globalWorkSizeDX), - MakeJitConstant("DY", runInfo.gemmStyle.globalWorkSizeDY), - MakeJitConstant("FILTER_SIZE_X_DIV2", params.conv.filterSize.x / 2), - MakeJitConstant("INPUT_BUFFER_WIDTH_PADDED", ""), // TODO: enable non padding path again - MakeJitConstant("INPUT_BUFFER_HEIGHT_PADDED", ""), - }); - - if (CeilDiv(RoundUp(params.output.X().v * params.output.Y().v, runInfo.gemmStyle.subBlockDimM), - runInfo.gemmStyle.globalWorkSizeDY) % - runInfo.lws1 != - 0) - jit.AddConstant(MakeJitConstant("LEFTOVERS", 1)); - - return jit; -} - -KernelsData fused_conv_eltwise_kernel_gemm::GetKernelsData(const Params& params, const optional_params& options) const { - return GetTunedKernelsDataByIndex(params, options); -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h deleted file mode 100644 index 9696f96253a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_gemm.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
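The SetDefault logic above sizes the GEMM-style launch by rounding the flattened spatial extent (M) and the feature count (N) up to sub-block multiples, then dividing by the per-thread work and re-rounding to the local size. A minimal standalone sketch of that arithmetic, assuming CeilDiv/RoundUp have the usual rounding semantics and using illustrative shapes plus an assumed mapping of the gemmStyle initializer onto named fields (none of the concrete numbers below come from this patch):

#include <cstddef>
#include <cstdio>

// Assumed semantics of kernel_selector's helpers: CeilDiv rounds the quotient
// up, RoundUp rounds v up to the next multiple of m.
static size_t CeilDiv(size_t a, size_t b) { return (a + b - 1) / b; }
static size_t RoundUp(size_t v, size_t m) { return CeilDiv(v, m) * m; }

int main() {
    // Illustrative output shape: 56x56 spatial, 96 features, batch 1.
    const size_t out_x = 56, out_y = 56, out_f = 96, batch = 1;
    // FP16 path above: 32x32 sub-blocks, one column/row per thread
    // (DX = DY = 1, an assumption about the gemmStyle field order),
    // and a 1x16 local size.
    const size_t subBlockDimM = 32, subBlockDimN = 32, dx = 1, dy = 1;
    const size_t lws0 = 1, lws1 = 16;

    const size_t sgemm_m = RoundUp(out_x * out_y, subBlockDimM);  // 3136
    const size_t sgemm_n = RoundUp(out_f, subBlockDimN);          // 96

    const size_t gws0 = RoundUp(CeilDiv(sgemm_n, dx), lws0);      // 96
    const size_t gws1 = RoundUp(CeilDiv(sgemm_m, dy), lws1);      // 3136
    const size_t gws2 = batch;

    printf("gws = {%zu, %zu, %zu}, lws = {%zu, %zu, 1}\n",
           gws0, gws1, gws2, lws0, lws1);
    return 0;
}

With these inputs CeilDiv(sgemm_m, dy) is already a multiple of lws1, so the LEFTOVERS branch above would stay disabled.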
- - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_gemm : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_gemm() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_gemm") {} - - virtual ~fused_conv_eltwise_kernel_gemm() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override; - std::string GetKernelName(const fused_conv_eltwise_params& params) const override; - bool NeedPaddedInput() const override { return true; } - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - bool Validate(const Params& p, const optional_params& o) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp deleted file mode 100644 index e299c2cfaa8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) 2019-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "fused_conv_eltwise_kernel_imad.h" -#include "common_tools.h" -#include "kernel_selector_utils.h" -#include - -// -// Kernel specific constants -// -#define SIMD_SIZE 16 - -static bool getOutBlock_WH(size_t output_size, - size_t stride, - size_t kernel_size, - size_t& output_block_w, - size_t& output_block_h) { - bool verify_output_ranges = false; - - output_block_w = output_block_h = 0; - - size_t upper_border = output_size < SIMD_SIZE ? output_size : SIMD_SIZE; - - size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1)) / stride; - - size_t max_posible_tile_size = upper_border < stride_restrictions ? 
upper_border : stride_restrictions; - - if (output_size % max_posible_tile_size == 0) { - output_block_w = max_posible_tile_size; - } else { - size_t min_horisontal_block_size = 2; // 4; - - size_t block_size = 0; - - for (size_t i = min_horisontal_block_size; i < max_posible_tile_size; i++) { - if (output_size % i == 0) - block_size = i; - } - - if (block_size != 0) { - output_block_w = block_size; - } else { - output_block_w = max_posible_tile_size; - verify_output_ranges = true; - } - } - - if (output_block_w <= 4) - output_block_h = output_block_w; - else - output_block_h = 1; - - return verify_output_ranges; -} -namespace kernel_selector { - -ParamsKey fused_conv_eltwise_kernel_imad::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableInputDataType(Datatype::UINT8); - k.EnableOutputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::UINT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputWeightsType(WeightsType::UINT8); - k.EnableInputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4); - k.EnableOutputLayout(DataLayout::byxf_af32); - - k.EnableDifferentTypes(); - k.EnableDifferentInputWeightsTypes(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableDilation(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableBatching(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - k.EnableEltwiseStride(); - return k; -} - -KernelsData fused_conv_eltwise_kernel_imad::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); -} - -JitConstants fused_conv_eltwise_kernel_imad::GetJitConstants(const fused_conv_eltwise_params& params, - const DispatchData& kd) const { - auto mem_consts = Parent::GetJitConstants(params, kd); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - mem_consts.Merge(MakeActivationJitConstants(params.conv.activations, GetUnitType(params), "_CONV_TYPED", true)); - mem_consts.Merge(MakeActivationJitConstants(params.activations, GetUnitType(params), "_ELTW_TYPED", true)); - mem_consts.Merge(MakeTypeJitConstants(Datatype::F32, "float")); - - const auto& iDims = input.GetDims(); - const auto& oDims = output.GetDims(); - const auto& weights = params.weights; - const auto& wDims = weights.GetDims(); - const int iX = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::X); - const int iY = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::Y); - const int iF = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::FEATURE); - const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM); - const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X); - const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y); - mem_consts.AddConstants({ - MakeJitConstant("_IW", iDims[iX].v), - MakeJitConstant("_IH", iDims[iY].v), - MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)), - MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after), - MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after), - MakeJitConstant("_OW", oDims[oX].v), - MakeJitConstant("_OH", oDims[oY].v), - MakeJitConstant("_OD", wDims[wOD].v), - MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after), - 
MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after), - MakeJitConstant("SIMD_SIZE", SIMD_SIZE), - MakeJitConstant("K_HEIGHT", wDims[iY].v), - MakeJitConstant("K_WIDTH", wDims[iX].v), - MakeJitConstant("K_STRIDE", params.conv.stride.x), // X and Y must be equal - MakeJitConstant("NON_BLOCK_LOAD", 1), - }); - - size_t obw, obh; - bool verify_output_ranges = getOutBlock_WH(oDims[oX].v, params.conv.stride.x, wDims[iX].v, obw, obh); - mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", obw), - MakeJitConstant("OUT_BLOCK_HEIGHT", obh), - MakeJitConstant("NEED_TO_VERIFY_OUTPUT_RANGES", verify_output_ranges)}); - if (params.non_conv_scale != 1.0f) - mem_consts.AddConstant(MakeJitConstant("NON_CONV_SCALE", params.non_conv_scale)); - - return mem_consts; -} // GetJitConstants - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_imad::SetDefault( - const fused_conv_eltwise_params& params, - int) const { - DispatchData kd; - - const auto& in = params.inputs[0]; - const auto& weights = params.weights; - const auto& iDims = in.GetDims(); - const auto& wDims = weights.GetDims(); - const int iX = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::X); - const int iY = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::Y); - const int iB = DataTensor::Channelndex(in.GetLayout(), Tensor::DataChannelName::BATCH); - const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM); - - size_t otw, oth; - getOutBlock_WH(iDims[iX].v, params.conv.stride.x, iDims[iX].pad.before + iDims[iX].pad.after, otw, oth); - - size_t dim_add = ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE); - if (dim_add != 0) - dim_add = SIMD_SIZE - dim_add; - - - std::vector global = {// globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW; - // number of tiles needed to cover output width - (((iDims[iX].v / params.conv.stride.x) + (otw - 1)) / otw), - - // globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH; - // number of tiles needed to cover output height - (((iDims[iY].v / params.conv.stride.y) + (oth - 1)) / oth), - - // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE); - // round depth range up - ((wDims[wOD].v * iDims[iB].v) + dim_add)}; - - std::vector local = {1, 1, SIMD_SIZE}; - - kd.gws0 = global[0]; - kd.gws1 = global[1]; - kd.gws2 = global[2]; - - kd.lws0 = local[0]; - kd.lws1 = local[1]; - kd.lws2 = local[2]; - - kd.cldnnStyle = {0, 0, 0, 0, 0}; - kd.gemmStyle = {0, 0, 0, 0, 0, 0}; - kd.efficiency = FORCE_PRIORITY_2; - - return kd; -} // SetDefault - -bool fused_conv_eltwise_kernel_imad::Validate(const Params& params, const optional_params& options) const { - if (!Parent::Validate(params, options)) { - return false; - } - - KernelData kd = KernelData::Default(params); - fused_conv_eltwise_params& newParams = *static_cast(kd.params.get()); - - if (newParams.conv.stride.x != newParams.conv.stride.y) { - // Strides must be equial - return false; - } else if ((newParams.conv.filterSize.x != m_FilterSizeX) || (newParams.conv.filterSize.y != m_FilterSizeY)) { - // Kernel does not support such filter size - return false; - } - - return true; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h deleted file mode 100644 index 7af500b2975..00000000000 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_imad.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_imad : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_imad() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_imad") {} - - virtual ~fused_conv_eltwise_kernel_imad() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& params, const optional_params& options) const override; - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - bool NeedPaddedInput() const override { return true; } - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::os_is_yx_osv16_isv4; - } - - size_t m_FilterSizeX = 1; - size_t m_FilterSizeY = 1; -}; -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp deleted file mode 100644 index 4859a0c9729..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - return k; -} - -bool fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::Validate(const Params& p, - const optional_params& o) const { - if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) { - return false; - } - - const fused_conv_eltwise_params& cp = static_cast(p); - - // make sure it's 1x1 conv - if (cp.conv.filterSize.x != 1 || cp.conv.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.conv.stride.x != 1 || cp.conv.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 128 != 0) // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M=128 - return false; - - if (k % 32 != 0) // Matrix size K, Must be mutliple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 128; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - 
return runInfo; -} - -JitConstants fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetJitConstants( - const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 128)); // Work-Group tile size M, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", 1)); // Persistent threads - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", 1)); // Persistent threads - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - bool eltw_padding = false; - if (!params.second_input_in_output) { - // for second input - const size_t in2_x_pitch = 32 * 4; - const size_t in2_y_pitch = 32 * 4 * params.inputs[1].X().LogicalDimPadded(); - const size_t in2_b_block_pitch = in2_y_pitch * params.inputs[1].Y().LogicalDimPadded(); - const size_t in2_f_block_pitch = in2_b_block_pitch * ((params.inputs[1].Batch().v + 3) / 4); - const size_t in2_offset = - in2_x_pitch * params.inputs[1].X().pad.before + in2_y_pitch * params.inputs[1].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN2_X_PITCH", in2_x_pitch)); - jit.AddConstant(MakeJitConstant("IN2_Y_PITCH", in2_y_pitch)); - jit.AddConstant(MakeJitConstant("IN2_B_BLOCK_PITCH", in2_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_F_BLOCK_PITCH", 
in2_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_OFFSET", in2_offset)); - - eltw_padding = params.inputs[1].X().pad.Total() != 0 || params.inputs[1].Y().pad.Total() != 0; - } else { - eltw_padding = out_padding; - } - - jit.AddConstant(MakeJitConstant("ELTW_WITH_PADDING", eltw_padding)); - - return jit; -} - -KernelsData fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8::GetKernelsData( - const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h deleted file mode 100644 index a02d08ca643..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
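The OUT_* and IN2_* pitch constants above follow a single pattern for the fs_bs_yx_bsv4_fsv32 layout: one x step spans a block of 32 features by 4 batches, and each coarser pitch multiplies in the padded extent of the next dimension. A self-contained sketch of that pattern, where the Dim struct is a hypothetical stand-in for the tensor API used here and the shapes are illustrative:

#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for one padded tensor dimension; LogicalDimPadded is
// assumed to mean the logical size plus padding on both sides.
struct Dim {
    size_t v, pad_before, pad_after;
    size_t LogicalDimPadded() const { return v + pad_before + pad_after; }
};

int main() {
    // Illustrative tensor: 28x28 spatial, batch 4, 1-pixel spatial padding.
    Dim x{28, 1, 1}, y{28, 1, 1}, b{4, 0, 0};

    const size_t x_pitch       = 32 * 4;                           // 32 features x 4 batches
    const size_t y_pitch       = x_pitch * x.LogicalDimPadded();   // one padded row
    const size_t b_block_pitch = y_pitch * y.LogicalDimPadded();   // one padded plane
    const size_t f_block_pitch = b_block_pitch * ((b.v + 3) / 4);  // ceil(batch / 4) blocks
    const size_t offset        = x_pitch * x.pad_before + y_pitch * y.pad_before;

    printf("x=%zu y=%zu b_block=%zu f_block=%zu offset=%zu\n",
           x_pitch, y_pitch, b_block_pitch, f_block_pitch, offset);
    return 0;
}

The kernel emits the same five values twice, once for the output tensor and once for the second eltwise input, which is why OUT_WITH_PADDING and ELTW_WITH_PADDING are tracked separately.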
- - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8 : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8() - : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8") {} - - virtual ~fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp deleted file mode 100644 index c0d8cd6f17a..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -static const size_t _SG_TILE_M = 32; -static const size_t _SG_TILE_N = 32; -static const size_t _SG_SIZE = 8; // sub group size -static const size_t _TILES_PER_SG_X = 1; // Persistent threads -static const size_t _TILES_PER_SG_Y = 1; // Persistent threads - -ParamsKey fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::INT8); - k.EnableOutputDataType(Datatype::INT8); - k.EnableInputWeightsType(WeightsType::INT8); - k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableBatching(); - k.EnableFusedConvEltwInt8Quantization(); - k.EnableFusedConvEltwOutputCalibration(); - k.DisableTuning(); - k.EnableFusedConvEltwiseRWOutOpt(); - return k; -} - -bool fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::Validate(const Params& p, - const optional_params& o) const { - if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) { - return false; - } - - const convolution_params& cp = static_cast(p); - - // make sure it's 1x1 conv - if (cp.filterSize.x != 1 || cp.filterSize.y != 1) - return false; - - // make sure stride is 1x1 - if (cp.stride.x != 1 || cp.stride.y != 1) - return false; - - // input padding not supported - if (cp.inputs[0].X().pad.Total() != 0 || cp.inputs[0].Y().pad.Total() != 0 || - cp.inputs[0].Feature().pad.Total() != 0 || cp.inputs[0].Batch().pad.Total() != 0) - return false; - - // input and output spatial sizes must match - if (!(cp.output.X().v == cp.inputs[0].X().v) || !(cp.output.Y().v == cp.inputs[0].Y().v)) - return false; - - const auto m = cp.output.X().v * cp.output.Y().v * cp.output.Batch().v; - const auto k = cp.inputs[0].Feature().v; - const auto n = cp.output.Feature().v; - - if (m % 32 != 0 && m % 224 != 0) // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M=128 - return false; - - if (k % 32 != 0) // Matrix size K, Must be mutliple of 32 - return false; - - if (n % 32 != 0 && n % 128 != 0) // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N=128 - return false; - - return true; -} - -fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::SetDefault( - const fused_conv_eltwise_params& arg, - int) const { - DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(arg); - - runInfo.efficiency = FORCE_PRIORITY_1; - - size_t mat_m = arg.output.X().v * arg.output.Y().v * arg.output.Batch().v; - size_t mat_n = arg.output.Feature().v; - - size_t _MATRIX_M = mat_m; - size_t _MATRIX_N = mat_n; - - size_t _WG_TILE_M = 224; - size_t _WG_TILE_N = 128; - - // Calculate number of threads needed - const size_t threadsX = (_MATRIX_N / (_SG_TILE_N / _SG_SIZE)) / _TILES_PER_SG_X; - const size_t threadsY = (_MATRIX_M / _SG_TILE_M) / _TILES_PER_SG_Y; - - // Define execution setup for kernel: - size_t globalWorkSize[3] = {threadsX, threadsY, 1}; - size_t localWorkSize[3] = {_SG_SIZE * _WG_TILE_N / _SG_TILE_N, _WG_TILE_M / _SG_TILE_M, 1}; - - runInfo.gws0 = globalWorkSize[0]; - runInfo.gws1 = globalWorkSize[1]; - runInfo.gws2 = globalWorkSize[2]; - - runInfo.lws0 = localWorkSize[0]; - runInfo.lws1 = localWorkSize[1]; - runInfo.lws2 = localWorkSize[2]; - - return runInfo; -} - 
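SetDefault above turns the matrix view of the fused operation (M = X*Y*batch rows, N = output features) into a launch grid: each sub-group column covers SG_TILE_N features spread over SG_SIZE lanes, each sub-group row covers SG_TILE_M matrix rows, and the local size is fixed by the work-group to sub-group tile ratios. A standalone sketch with made-up matrix sizes that satisfy the multiple-of-32 checks in Validate:

#include <cstddef>
#include <cstdio>

int main() {
    // Tile constants as defined at the top of this kernel: 32x32 sub-group
    // tiles, sub-group size 8, one tile per sub-group in each direction,
    // 224x128 work-group tile.
    const size_t SG_TILE_M = 32, SG_TILE_N = 32, SG_SIZE = 8;
    const size_t TILES_PER_SG_X = 1, TILES_PER_SG_Y = 1;
    const size_t WG_TILE_M = 224, WG_TILE_N = 128;

    // Illustrative matrix sizes only: M = X*Y*batch, N = output features.
    const size_t MATRIX_M = 448, MATRIX_N = 256;

    const size_t threadsX = (MATRIX_N / (SG_TILE_N / SG_SIZE)) / TILES_PER_SG_X;  // 64
    const size_t threadsY = (MATRIX_M / SG_TILE_M) / TILES_PER_SG_Y;              // 14

    const size_t lws0 = SG_SIZE * WG_TILE_N / SG_TILE_N;  // 8 * 128 / 32 = 32
    const size_t lws1 = WG_TILE_M / SG_TILE_M;            // 224 / 32 = 7

    printf("gws = {%zu, %zu, 1}, lws = {%zu, %zu, 1}\n",
           threadsX, threadsY, lws0, lws1);
    return 0;
}

With M = 448 and N = 256 this yields a 64x14 thread grid over 32x7 work-items per group, so both global sizes divide evenly by the local sizes.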
-JitConstants fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetJitConstants( - const fused_conv_eltwise_params& params, - const DispatchData& runInfo) const { - auto jit = Parent::GetJitConstants(params, runInfo); - - jit.AddConstant(MakeJitConstant("WG_TILE_M", 224)); // Work-Group tile size M, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("WG_TILE_N", 128)); // Work-Group tile size N, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("TILES_PER_SG_X", _TILES_PER_SG_X)); - jit.AddConstant(MakeJitConstant("TILES_PER_SG_Y", _TILES_PER_SG_Y)); - - // Do not change values below - jit.AddConstant(MakeJitConstant("DIM_X", 0)); - jit.AddConstant(MakeJitConstant("DIM_Y", 1)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K", 32)); - jit.AddConstant(MakeJitConstant("MATRIX_SMALL_K_BFLOAT", 16)); - jit.AddConstant(MakeJitConstant("SG_TILE_M", _SG_TILE_M)); - jit.AddConstant(MakeJitConstant("SG_TILE_N", _SG_TILE_N)); - jit.AddConstant(MakeJitConstant("SG_SIZE", _SG_SIZE)); - jit.AddConstant(MakeJitConstant("SIMD_LANE_M", "SG_TILE_M")); - jit.AddConstant(MakeJitConstant("SIMD_LANE_N", "(SG_TILE_N / SG_SIZE)")); - jit.AddConstant(MakeJitConstant("WG_SIZE", "(SG_SIZE * WG_TILE_N / SG_TILE_N) * (WG_TILE_M / SG_TILE_M)")); - - jit.AddConstant(MakeJitConstant("COMPILE_KERNELS", "")); - jit.AddConstant(MakeJitConstant("TILED_GLOBAL_LAYOUT", "")); - jit.AddConstant(MakeJitConstant("OUTPUT_TILED_GLOBAL_LAYOUT", "")); - - const auto& input = params.inputs[0]; - const auto& output = params.output; - - auto m = output.X().v * output.Y().v * output.Batch().v; - auto k = input.Feature().v; - auto n = output.Feature().v; - - jit.AddConstant(MakeJitConstant("MATRIX_M", m)); // Matrix size M, Must be mutliple of 32 and multiple of WG_TILE_M - jit.AddConstant(MakeJitConstant("MATRIX_K", k)); // Matrix size K, Must be mutliple of 32 - jit.AddConstant(MakeJitConstant("MATRIX_N", n)); // Matrix size N, Must be mutliple of 32 and multiple of WG_TILE_N - - const size_t out_x_pitch = 32 * 4; - const size_t out_y_pitch = 32 * 4 * params.output.X().LogicalDimPadded(); - const size_t out_b_block_pitch = out_y_pitch * params.output.Y().LogicalDimPadded(); - const size_t out_f_block_pitch = out_b_block_pitch * ((params.output.Batch().v + 3) / 4); - const size_t out_offset = out_x_pitch * params.output.X().pad.before + out_y_pitch * params.output.Y().pad.before; - - jit.AddConstant(MakeJitConstant("OUT_X_PITCH", out_x_pitch)); - jit.AddConstant(MakeJitConstant("OUT_Y_PITCH", out_y_pitch)); - jit.AddConstant(MakeJitConstant("OUT_B_BLOCK_PITCH", out_b_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_F_BLOCK_PITCH", out_f_block_pitch)); - jit.AddConstant(MakeJitConstant("OUT_OFFSET", out_offset)); - - bool out_padding = output.X().pad.Total() != 0 || output.Y().pad.Total() != 0; - jit.AddConstant(MakeJitConstant("OUT_WITH_PADDING", out_padding)); - - bool eltw_padding = false; - if (!params.second_input_in_output) { - // for second input - const size_t in2_x_pitch = 32 * 4; - const size_t in2_y_pitch = 32 * 4 * params.inputs[1].X().LogicalDimPadded(); - const size_t in2_b_block_pitch = in2_y_pitch * params.inputs[1].Y().LogicalDimPadded(); - const size_t in2_f_block_pitch = in2_b_block_pitch * ((params.inputs[1].Batch().v + 3) / 4); - const size_t in2_offset = - in2_x_pitch * params.inputs[1].X().pad.before + in2_y_pitch * params.inputs[1].Y().pad.before; - - jit.AddConstant(MakeJitConstant("IN2_X_PITCH", in2_x_pitch)); - jit.AddConstant(MakeJitConstant("IN2_Y_PITCH", in2_y_pitch)); - 
jit.AddConstant(MakeJitConstant("IN2_B_BLOCK_PITCH", in2_b_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_F_BLOCK_PITCH", in2_f_block_pitch)); - jit.AddConstant(MakeJitConstant("IN2_OFFSET", in2_offset)); - - eltw_padding = params.inputs[1].X().pad.Total() != 0 || params.inputs[1].Y().pad.Total() != 0; - } else { - eltw_padding = out_padding; - } - - jit.AddConstant(MakeJitConstant("ELTW_WITH_PADDING", eltw_padding)); - - return jit; -} - -KernelsData fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8::GetKernelsData( - const Params& params, - const optional_params& options) const { - KernelsData kd = GetCommonKernelsData(params, options); - if (!kd.empty()) - kd[0].estimatedTime = FORCE_PRIORITY_1; // _3 - return kd; -} -} // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h deleted file mode 100644 index a43f3232824..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#pragma once - -#include "fused_conv_eltwise_kernel_base.h" -#include - -namespace kernel_selector { - -class fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8 : public fused_conv_eltwise_kernel_base { -public: - using Parent = fused_conv_eltwise_kernel_base; - fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8() - : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8") {} - - virtual ~fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; - -protected: - bool Validate(const Params& p, const optional_params& o) const override; - JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override; - DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override; - WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params &) const override { - return WeightsLayout::is_o32_yx_isv32_swizzled_by_4; - } -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp index fe1976bc44d..da1c3c8e16f 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp @@ -14,26 +14,16 @@ #include "fused_conv_eltwise_kernel_selector.h" -#include "fused_conv_eltwise_kernel_gemm.h" #include "fused_conv_eltwise_kernel_bfyx_1x1_opt.h" #include "fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h" -#include "fused_conv_eltwise_kernel_mmad_32x32sg_128x128wg_slm_int8.h" -#include "fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8.h" #include "fused_conv_eltwise_kernel_yxfb_yxio_b16.h" -#include "fused_conv_eltwise_kernel_imad.h" -#include "fused_conv_eltwise_kernel_af32_imad_1x1.h" #include "fused_conv_eltwise_kernel_bfyx_iyxo.h" namespace kernel_selector { fused_conv_eltwise_kernel_selector::fused_conv_eltwise_kernel_selector() { - // Attach(); Attach(); Attach(); Attach(); - Attach(); - Attach(); - Attach(); - Attach(); Attach(); } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp deleted file mode 100644 index 6afc7a8adf7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2018-2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp
deleted file mode 100644
index 6afc7a8adf7..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2018-2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "index_select_kernel_base.h"
-
-#include "kernel_selector_utils.h"
-#include <sstream>
-#include <vector>
-
-namespace kernel_selector {
-JitConstants IndexSelectKernelBase::GetJitConstants(const index_select_params& params) const {
-    JitConstants jit = MakeBaseParamsJitConstants(params);
-
-    jit.AddConstant(MakeJitConstant("AXES_NUMBER", params.axes.size()));
-
-    if (params.reverse) {
-        jit.AddConstant(MakeJitConstant("REVERSE", 1));
-    }
-
-    for (size_t i = 0; i < params.axes.size(); i++) {
-        std::string size_name = "REVERSE_AXIS_SIZE";
-        size_t size_value = 0;
-        if (params.axes.size() > 1) {
-            std::stringstream ss;
-            ss << "REVERSE_" << toString(params.axes[i]) << "_SIZE";
-            size_name = ss.str();
-        }
-        jit.AddConstant(MakeJitConstant(toString(params.axes[i]), ""));
-        if (params.reverse) {
-            if (params.axes[i] == IndexSelectAxis::BATCH) {
-                size_value = params.inputs.at(0).Batch().v;
-            } else if (params.axes[i] == IndexSelectAxis::X) {
-                size_value = params.inputs.at(0).X().v;
-            } else if (params.axes[i] == IndexSelectAxis::Y) {
-                size_value = params.inputs.at(0).Y().v;
-            } else if (params.axes[i] == IndexSelectAxis::FEATURE) {
-                size_value = params.inputs.at(0).Feature().v;
-            }
-        }
-        jit.AddConstant(MakeJitConstant(size_name, size_value));
-    }
-
-    return jit;
-}
-
-IndexSelectKernelBase::DispatchData IndexSelectKernelBase::SetDefault(const index_select_params& params) {
-    const auto& output = params.output;
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
-    std::vector<size_t> global;
-
-    if (params.axes.size() == 1) {
-        if (params.reverse) {
-            if (params.axes[0] == IndexSelectAxis::BATCH) {
-                global = {1, params.inputs.at(0).Batch().v, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::X) {
-                global = {output.Batch().v, params.inputs.at(0).X().v, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::Y) {
-                global = {output.Batch().v, params.inputs.at(0).Y().v, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::FEATURE) {
-                global = {output.Batch().v, params.inputs.at(0).Feature().v, output.Y().v};
-            }
-        } else {
-            const auto indices = params.inputs.at(1).X().v;
-
-            if (params.axes[0] == IndexSelectAxis::BATCH) {
-                global = {1, indices, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::X || params.axes[0] == IndexSelectAxis::Y) {
-                global = {output.Batch().v, indices, output.Feature().v};
-            } else if (params.axes[0] == IndexSelectAxis::FEATURE) {
-                global = {output.Batch().v, indices, output.Y().v};
-            }
-        }
-    } else {
-        if (params.reverse) {
-            global = {output.Batch().v, output.Y().v, output.Feature().v};
-        }
-    }
-
-    const auto& local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
-    kd.gws0 = global[0];
-    kd.gws1 = global[1];
-    kd.gws2 = global[2];
-
-    kd.lws0 = local[0];
-    kd.lws1 = local[1];
-    kd.lws2 = local[2];
-
-    return kd;
-}
-
-KernelsData IndexSelectKernelBase::GetCommonKernelsData(const Params& params,
-                                                        const optional_params& options,
-                                                        float estimated_time) const {
-    assert(params.GetType() == KernelType::INDEX_SELECT);
-
-    const auto& prim_params =
-        static_cast<const index_select_params&>(params);
-
-    auto run_info = SetDefault(prim_params);
-    KernelData k_data = KernelData::Default<index_select_params>(params);
-
-    auto cldnn_jit = GetJitConstants(prim_params);
-    auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = k_data.kernels[0];
-    FillCLKernelData(kernel,
-                     run_info,
-                     params.engineInfo,
-                     kernelName,
-                     jit,
-                     entry_point,
-                     DEFAULT,
-                     false,
-                     false,
-                     (uint32_t)prim_params.inputs.size());
-
-    k_data.estimatedTime = estimated_time;
-
-    return {k_data};
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h
deleted file mode 100644
index 3d19510f333..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_base.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018-2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "common_kernel_base.h"
-#include "kernel_selector_params.h"
-#include <vector>
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// index_select_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct index_select_params : public base_params {
-    index_select_params() : base_params(KernelType::INDEX_SELECT) {}
-
-    std::vector<IndexSelectAxis> axes = {IndexSelectAxis::BATCH};
-    bool reverse = false;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// index_select_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct index_select_optional_params : optional_params {
-    index_select_optional_params() : optional_params(KernelType::INDEX_SELECT) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// IndexSelectKernelBase
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class IndexSelectKernelBase : public common_kernel_base {
-public:
-    using common_kernel_base::common_kernel_base;
-    virtual ~IndexSelectKernelBase() {}
-
-    using DispatchData = CommonDispatchData;
-
-protected:
-    JitConstants GetJitConstants(const index_select_params& params) const;
-    static DispatchData SetDefault(const index_select_params& params);
-    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimated_time) const;
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp
deleted file mode 100644
index 47f4a7554d2..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "index_select_kernel_ref.h"
-
-namespace kernel_selector {
-ParamsKey IndexSelectKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableInputDataType(Datatype::UINT8);
-    k.EnableInputDataType(Datatype::INT32);
-
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::INT32);
-
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::yxfb);
-
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::yxfb);
-
-    k.EnableBatching();
-
-    k.EnableIndexSelectAxis(IndexSelectAxis::BATCH);
-    k.EnableIndexSelectAxis(IndexSelectAxis::FEATURE);
-    k.EnableIndexSelectAxis(IndexSelectAxis::Y);
-    k.EnableIndexSelectAxis(IndexSelectAxis::X);
-
-    k.EnableDifferentTypes();
-
-    return k;
-}
-
-KernelsData IndexSelectKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h
deleted file mode 100644
index a185b0deb3e..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_ref.h
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "index_select_kernel_base.h"
-
-namespace kernel_selector {
-class IndexSelectKernelRef : public IndexSelectKernelBase {
-public:
-    IndexSelectKernelRef() : IndexSelectKernelBase("index_select_gpu_ref") {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp
deleted file mode 100644
index 674d5ca544f..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "index_select_kernel_selector.h"
-#include "index_select_kernel_ref.h"
-
-namespace kernel_selector {
-index_select_kernel_selector::index_select_kernel_selector() { Attach<IndexSelectKernelRef>(); }
-
-KernelsData index_select_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::INDEX_SELECT);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h
deleted file mode 100644
index f8030c98432..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/index_select/index_select_kernel_selector.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class index_select_kernel_selector : public kernel_selector_base {
-public:
-    static index_select_kernel_selector& Instance() {
-        static index_select_kernel_selector instance;
-        return instance;
-    }
-
-    index_select_kernel_selector();
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
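[Reviewer note, not part of the patch: the files deleted above implemented clDNN's index_select, which gathered entries (or reversed them) along one axis of a 4-D tensor. As a rough functional reference for what is being dropped, the behaviour of the gather path corresponds to something like this host-side sketch (hypothetical helper, dense bfyx layout assumed, feature axis only; the real kernels handled padded, strided DataTensors and all four axes):

    #include <cstddef>
    #include <vector>

    // Functional sketch of index_select along the feature axis of a bfyx tensor.
    std::vector<float> index_select_feature(const std::vector<float>& input,
                                            size_t b, size_t f, size_t y, size_t x,
                                            const std::vector<size_t>& indices) {
        std::vector<float> out(b * indices.size() * y * x);
        for (size_t bi = 0; bi < b; ++bi)
            for (size_t oi = 0; oi < indices.size(); ++oi)
                for (size_t yi = 0; yi < y; ++yi)
                    for (size_t xi = 0; xi < x; ++xi) {
                        // bfyx linear index: ((b * F + f) * Y + y) * X + x
                        size_t src = ((bi * f + indices[oi]) * y + yi) * x + xi;
                        size_t dst = ((bi * indices.size() + oi) * y + yi) * x + xi;
                        out[dst] = input[src];
                    }
        return out;
    }
]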
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp
deleted file mode 100644
index cca47b2aa18..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_axis.h"
-#include <algorithm>
-
-namespace kernel_selector {
-ParamsKey LookUpTableKernelAxis::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableLookUpTableIndicesFormat(Datatype::F32);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableLookUpTableAxis(LookUpTableAxis::BATCH);
-    k.EnableLookUpTableAxis(LookUpTableAxis::X);
-    k.EnableLookUpTableAxis(LookUpTableAxis::Y);
-    k.EnableLookUpTableAxis(LookUpTableAxis::FEATURE);
-    k.EnableBatching();
-    return k;
-}
-
-KernelsData LookUpTableKernelAxis::GetKernelsData(const Params& params, const optional_params& options) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    const lookup_table_params& orgParams = static_cast<const lookup_table_params&>(params);
-
-    DispatchData runInfo;
-    runInfo.fp16UnitUsed = orgParams.inputs[0].GetDType() == Datatype::F16;
-
-    if (orgParams.lookUpTableAxis == LookUpTableAxis::BATCH) {
-        runInfo.gws0 = orgParams.inputs[0].X().v;
-        runInfo.gws1 = orgParams.inputs[0].Y().v;
-        runInfo.gws2 = orgParams.inputs[0].Feature().v;
-    } else if (orgParams.lookUpTableAxis == LookUpTableAxis::FEATURE) {
-        runInfo.gws0 = orgParams.inputs[0].X().v;
-        runInfo.gws1 = orgParams.inputs[0].Y().v;
-        runInfo.gws2 = orgParams.inputs[0].Batch().v;
-    } else if (orgParams.lookUpTableAxis == LookUpTableAxis::Y) {
-        runInfo.gws0 = orgParams.inputs[0].X().v;
-        runInfo.gws1 = orgParams.inputs[0].Feature().v;
-        runInfo.gws2 = orgParams.inputs[0].Batch().v;
-    } else if (orgParams.lookUpTableAxis == LookUpTableAxis::X) {
-        runInfo.gws0 = orgParams.inputs[0].Y().v;
-        runInfo.gws1 = orgParams.inputs[0].Feature().v;
-        runInfo.gws2 = orgParams.inputs[0].Batch().v;
-    }
-
-    runInfo.lws0 = std::min(std::max(runInfo.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
-    while (runInfo.gws0 % runInfo.lws0 != 0) {
-        --runInfo.lws0;
-    }
-    runInfo.lws1 = 1;
-    runInfo.lws2 = 1;
-
-    KernelData kd = KernelData::Default<lookup_table_params>(params);
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2);
-
-    kd.estimatedTime = FORCE_PRIORITY_9;
-
-    return {kd};
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h
deleted file mode 100644
index 90bb61011b4..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_axis.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "lookup_table_kernel_base.h"
-
-namespace kernel_selector {
-class LookUpTableKernelAxis : public LookUpTableKernelBase {
-public:
-    LookUpTableKernelAxis() : LookUpTableKernelBase("lookup_table_axis") {}
-    virtual ~LookUpTableKernelAxis() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp
deleted file mode 100644
index 6874efa07fd..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_base.h"
-#include <algorithm>
-
-namespace kernel_selector {
-bool LookUpTableKernelBase::Validate(const Params& p, const optional_params& o) const {
-    if (p.GetType() != KernelType::LOOKUP_TABLE || o.GetType() != KernelType::LOOKUP_TABLE) {
-        return false;
-    }
-
-    return true;
-}
-
-JitConstants LookUpTableKernelBase::GetJitConstants(const lookup_table_params& params) const {
-    JitConstants jit = MakeBaseParamsJitConstants(params);
-
-    jit.AddConstants({
-        MakeJitConstant("VAL_NUM", params.numberOfValues),
-        MakeJitConstant(toString(params.lookUpTableAxis) + "_AXIS", 1),
-    });
-
-    return jit;
-}
-
-LookUpTableKernelBase::DispatchData LookUpTableKernelBase::SetDefault(const lookup_table_params& params) const {
-    DispatchData kd;
-
-    kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
-    // Determine global work sizes.
-    kd.gws0 = params.inputIndices.X().v;
-    kd.gws1 = params.inputIndices.Batch().v;  // B
-    kd.gws2 = 1;
-
-    kd.lws0 = std::min(std::max(kd.gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
-    while (kd.gws0 % kd.lws0 != 0) {
-        --kd.lws0;
-    }
-    kd.lws1 = 1;
-    kd.lws2 = 1;
-
-    return kd;
-}
-
-KernelsData LookUpTableKernelBase::GetCommonKernelsData(const Params& params,
-                                                        const optional_params& options,
-                                                        float estimatedTime) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    const lookup_table_params& orgParams = static_cast<const lookup_table_params&>(params);
-
-    DispatchData runInfo = SetDefault(orgParams);
-
-    KernelData kd = KernelData::Default<lookup_table_params>(params);
-
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point, "", false, false, 2);
-
-    kd.estimatedTime = estimatedTime;
-
-    return {kd};
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h
deleted file mode 100644
index a221f417d68..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_base.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "common_kernel_base.h"
-#include "kernel_selector_params.h"
-
-namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// lookup_table_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct lookup_table_params : public base_params {
-    lookup_table_params() : base_params(KernelType::LOOKUP_TABLE) {}
-
-    LookUpTableAxis lookUpTableAxis = LookUpTableAxis::XYF;
-    uint32_t numberOfValues = 0;
-    DataTensor inputIndices;
-
-    virtual ParamsKey GetParamsKey() const {
-        ParamsKey k = base_params::GetParamsKey();
-        k.EnableLookUpTableAxis(lookUpTableAxis);
-        k.EnableLookUpTableIndicesFormat(inputIndices.GetDType());
-        return k;
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// lookup_table_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct lookup_table_optional_params : optional_params {
-    lookup_table_optional_params() : optional_params(KernelType::LOOKUP_TABLE) {}
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// lookup_table_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-class LookUpTableKernelBase : public common_kernel_base {
-public:
-    using common_kernel_base::common_kernel_base;
-    virtual ~LookUpTableKernelBase() {}
-
-    struct DispatchData : public CommonDispatchData {};
-
-protected:
-    bool Validate(const Params&, const optional_params&) const override;
-    virtual JitConstants GetJitConstants(const lookup_table_params& params) const;
-    virtual DispatchData SetDefault(const lookup_table_params& params) const;
-    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp
deleted file mode 100644
index cd4006c8292..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_ref.h"
-
-namespace kernel_selector {
-ParamsKey LookUpTableKernelRef::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F16);
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableLookUpTableIndicesFormat(Datatype::F32);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableInputLayout(DataLayout::yxfb);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::yxfb);
-    k.EnableLookUpTableAxis(LookUpTableAxis::XYF);
-    k.EnableBatching();
-    return k;
-}
-
-KernelsData LookUpTableKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h
deleted file mode 100644
index fab406cba16..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_ref.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "lookup_table_kernel_base.h"
-
-namespace kernel_selector {
-class LookUpTableKernelRef : public LookUpTableKernelBase {
-public:
-    LookUpTableKernelRef() : LookUpTableKernelBase("lookup_table_ref") {}
-    virtual ~LookUpTableKernelRef() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp
deleted file mode 100644
index 3ad1358bbb3..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#include "lookup_table_kernel_selector.h"
-#include "lookup_table_kernel_ref.h"
-#include "lookup_table_kernel_axis.h"
-
-namespace kernel_selector {
-
-lookup_table_kernel_selector::lookup_table_kernel_selector() {
-    Attach<LookUpTableKernelRef>();
-    Attach<LookUpTableKernelAxis>();
-}
-
-KernelsData lookup_table_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::LOOKUP_TABLE);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h
deleted file mode 100644
index 7dcc535411a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/lookup_table/lookup_table_kernel_selector.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class lookup_table_kernel_selector : public kernel_selector_base {
-public:
-    static lookup_table_kernel_selector& Instance() {
-        static lookup_table_kernel_selector instance_;
-        return instance_;
-    }
-
-    lookup_table_kernel_selector();
-
-    virtual ~lookup_table_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
index 28eb5adc1ee..65b7b99d089 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/permute/permute_kernel_ref.cpp
@@ -101,7 +101,7 @@ KernelsData PermuteKernelRef::GetKernelsData(const Params& params, const optiona
     kernel.workGroups.global = {in.X().v, in.Y().v * in.Z().v * in.W().v, in.Feature().v * in.Batch().v};
     kernel.workGroups.local = GetOptimalLocalWorkGroupSizes(kernel.workGroups.global, params.engineInfo);
     kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT);
-    kernel.arguments = GetArgsDesc(1, false, false, false, false, GetFusedPrimitiveInputsCount(params));
+    kernel.arguments = GetArgsDesc(1, false, false, GetFusedPrimitiveInputsCount(params));
 
     kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
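[Reviewer note, not part of the patch: one detail worth keeping in mind from the lookup_table code removed above is its work-group heuristic. Both the axis and base kernels derived lws0 with the same clamp-then-largest-divisor loop; extracted as a standalone helper (illustrative only, mirroring the deleted lines), it is:

    #include <algorithm>
    #include <cstddef>

    // Start at min(max(gws0, 1), 32), then decrement until the local size
    // evenly divides the global size, as OpenCL requires lws to divide gws.
    size_t pick_lws0(size_t gws0) {
        size_t lws0 = std::min(std::max(gws0, static_cast<size_t>(1)), static_cast<size_t>(32));
        while (gws0 % lws0 != 0) {
            --lws0;
        }
        return lws0;
    }

The loop always terminates because lws0 = 1 divides everything.]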
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp
deleted file mode 100644
index 5e20ef6349a..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_average_opt.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKernelGPUAverageOpt::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableInputLayout(DataLayout::bfyx);
-    k.EnableOutputLayout(DataLayout::bfyx);
-    k.EnablePoolType(PoolType::AVG);
-    k.EnablePoolRemainder(PoolRemainder::FLOOR);
-    k.EnablePoolRemainder(PoolRemainder::CEIL);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
-    k.EnableDifferentTypes();
-    return k;
-}
-
-bool PoolingKernelGPUAverageOpt::Validate(const Params& p, const optional_params& o) const {
-    if (!PoolingKernelBase::Validate(p, o)) {
-        return false;
-    }
-
-    const pooling_params& params = static_cast<const pooling_params&>(p);
-
-    if (!params.activations.empty()) {
-        return {};
-    }
-
-    if ((params.poolSize.x != 3) || (params.poolSize.y != 3) || (params.poolStride.x != 1) ||
-        (params.poolStride.y != 1) || (params.poolPad.x != 1) || (params.poolPad.y != 1) ||
-        !(params.inputs[0] == params.output) || params.inputs[0].PitchesDifferFromLogicalDims() ||
-        params.output.PitchesDifferFromLogicalDims()) {
-        return false;
-    }
-
-    return true;
-}
-
-static uSize GetTileDimentions() {
-    constexpr int simdSize = 16;
-
-    return {simdSize - 2, 7};
-}
-
-PoolingKernelBase::DispatchData PoolingKernelGPUAverageOpt::SetDefault(const pooling_params& params) const {
-    constexpr int simdSize = 16;
-
-    DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
-    auto tileDims = GetTileDimentions();
-
-    const int numTilesX =
-        static_cast<int>(std::ceil(static_cast<float>(params.inputs[0].X().v) / static_cast<float>(tileDims.x)));
-    const int numTilesY =
-        static_cast<int>(std::ceil(static_cast<float>(params.inputs[0].Y().v) / static_cast<float>(tileDims.y)));
-
-    runInfo.gws0 = numTilesX * simdSize;
-    runInfo.gws1 = numTilesY;
-    runInfo.gws2 = params.inputs[0].Feature().v;
-    runInfo.lws0 = simdSize;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = 1;
-
-    return runInfo;
-}
-
-JitConstants PoolingKernelGPUAverageOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto tileDims = GetTileDimentions();
-    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
-
-    if (tileDims.y != 0 && tileDims.x != 0) {
-        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0));
-        jit.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y));
-        jit.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x));
-        jit.AddConstant(MakeJitConstant("ONE_OVER_POOL_SIZE", 1.f / (params.poolSize.x * params.poolSize.y)));
-    }
-
-    return jit;
-}
-
-KernelsData PoolingKernelGPUAverageOpt::GetKernelsData(const Params& params, const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_7);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h
deleted file mode 100644
index 828434705fa..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-
-namespace kernel_selector {
-class PoolingKernelGPUAverageOpt : public PoolingKernelBase {
-public:
-    PoolingKernelGPUAverageOpt() : PoolingKernelBase("pooling_gpu_average_opt") {}
-    virtual ~PoolingKernelGPUAverageOpt() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-
-protected:
-    bool Validate(const Params&, const optional_params&) const override;
-    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
-    DispatchData SetDefault(const pooling_params& params) const override;
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp
deleted file mode 100644
index 9f5a2520440..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    k.EnablePoolType(PoolType::MAX);
-    k.EnablePoolType(PoolType::AVG);
-    k.EnablePoolRemainder(PoolRemainder::FLOOR);
-    k.EnablePoolRemainder(PoolRemainder::CEIL);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
-    k.EnableDifferentTypes();
-    return k;
-}
-
-PoolingKernelBase::DispatchData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::SetDefault(const pooling_params& params) const {
-    constexpr int simdSize = 8;
-
-    DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
-    runInfo.gws0 = params.output.X().v;
-    runInfo.gws1 = params.output.Y().v;
-    // we got fs_bs_yx_bsv4_fsv32 format, we process 4 batches and 4 features per workitem
-    runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * RoundUp(params.output.Batch().v, 4)) / (4 * 4);
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = simdSize;
-
-    return runInfo;
-}
-
-JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
-    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf = {"",
-                                      {"b + bi", "f", "y", "x"},
-                                      "char_result",
-                                      input_dt,
-                                      4,
-                                      LoadType::LT_UNALIGNED,
-                                      BoundaryCheck::ENABLED,
-                                      IndexType::TENSOR_COORD,
-                                      Tensor::DataChannelName::FEATURE};
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
-    }
-
-    return jit;
-}
-
-bool PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::Validate(const Params& params, const optional_params& options) const {
-    if (!PoolingKernelBase::Validate(params, options)) {
-        return false;
-    }
-
-    auto p = dynamic_cast<const pooling_params&>(params);
-
-    if (p.quantization != QuantizationType::NONE && p.poolType == PoolType::AVG) {
-        return false;
-    }
-
-    return true;
-}
-KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params,
                                                                 const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_2);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h
deleted file mode 100644
index 307b426a563..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-
-namespace kernel_selector {
-class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32 : public PoolingKernelBase {
-public:
-    PoolingKerneGPU_fs_bs_yx_bsv4_fsv32() : PoolingKernelBase("pooling_gpu_fs_bs_yx_bsv4_fsv32") {}
-    virtual ~PoolingKerneGPU_fs_bs_yx_bsv4_fsv32() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-    DispatchData SetDefault(const pooling_params& params) const override;
-
-protected:
-    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
-    bool Validate(const Params&, const optional_params&) const override;
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp
deleted file mode 100644
index 34f97ab9cc9..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h"
-
-namespace kernel_selector {
-ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetSupportedKey() const {
-    ParamsKey k;
-    k.EnableInputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::INT8);
-    k.EnableOutputDataType(Datatype::UINT8);
-    k.EnableOutputDataType(Datatype::F16);
-    k.EnableOutputDataType(Datatype::F32);
-    k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
-    k.EnableTensorOffset();
-    k.EnableTensorPitches();
-    k.EnableBatching();
-    k.EnablePoolType(PoolType::MAX);
-    k.EnablePoolRemainder(PoolRemainder::FLOOR);
-    k.EnablePoolRemainder(PoolRemainder::CEIL);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
-    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
-    k.EnableDifferentTypes();
-    return k;
-}
-
-size_t static get_batch_sub_groups_count(const pooling_params& params) {
-    if (params.inputs[0].Batch().v % 32 == 0)
-        return 8;  // divided by 4 because we process 4 batches per subgroup
-    return 1;
-}
-
-PoolingKernelBase::DispatchData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::SetDefault(
-    const pooling_params& params) const {
-    constexpr int simdSize = 32;
-
-    DispatchData runInfo = PoolingKernelBase::SetDefault(params);
-
-    runInfo.gws0 = params.output.X().v;
-    runInfo.gws1 = params.output.Y().v;
-    // we got fs_bs_yx_bsv4_fsv32 format, we process 4 batches and 4 features per workitem
-    runInfo.gws2 = (RoundUp(params.output.Feature().v, 32) * RoundUp(params.output.Batch().v, 4)) / (4);  // *4);
-
-    runInfo.lws0 = 1;
-    runInfo.lws1 = 1;
-    runInfo.lws2 = simdSize * get_batch_sub_groups_count(params);
-
-    return runInfo;
-}
-
-JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetJitConstants(const pooling_params& params,
                                                                          DispatchData kd) const {
-    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
-
-    const size_t in_x_pitch = 32 * 4;
-    const size_t in_y_pitch = 32 * 4 * params.inputs[0].X().LogicalDimPadded();
-    const size_t in_b_block_pitch = in_y_pitch * params.inputs[0].Y().LogicalDimPadded();
-    const size_t in_f_block_pitch = in_b_block_pitch * ((params.inputs[0].Batch().v + 3) / 4);
-    const size_t in_offset =
-        in_x_pitch * params.inputs[0].X().pad.before + in_y_pitch * params.inputs[0].Y().pad.before;
-
-    jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
-    jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
-    jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
-    jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
-    jit.AddConstant(MakeJitConstant("BATCH_SG_COUNT", get_batch_sub_groups_count(params)));
-    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
-    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
-
-    if (!params.fused_ops.empty()) {
-        auto input_dt = GetActivationType(params);
-        FusedOpsConfiguration conf = {"",
-                                      {"b", "f", "y", "x"},
-                                      "pool_result",
-                                      input_dt,
-                                      4,
-                                      LoadType::LT_UNALIGNED,
-                                      BoundaryCheck::ENABLED,
-                                      IndexType::TENSOR_COORD,
-                                      Tensor::DataChannelName::FEATURE};
-        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
-    }
-
-    return jit;
-}
-
-KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetKernelsData(const Params& params,
                                                                        const optional_params& options) const {
-    return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h
deleted file mode 100644
index 3e2de8f1a3b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#pragma once
-
-#include "pooling_kernel_base.h"
-
-namespace kernel_selector {
-class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32 : public PoolingKernelBase {
-public:
-    PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32() : PoolingKernelBase("pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32") {}
-    virtual ~PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32() {}
-
-    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
-    ParamsKey GetSupportedKey() const override;
-    DispatchData SetDefault(const pooling_params& params) const override;
-
-protected:
-    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
-    std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::QUANTIZE,
-                 FusedOpType::SCALE,
-                 FusedOpType::ACTIVATION };
-    }
-};
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
index f6a5bf0aa73..11ae5cf90ae 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
@@ -16,14 +16,11 @@
 #include "pooling_kernel_selector.h"
 #include "pooling_kernel_gpu_ref.h"
 #include "pooling_kernel_gpu_byxf_opt.h"
-#include "pooling_kernel_gpu_average_opt.h"
 #include "pooling_kernel_gpu_bfyx_block_opt.h"
 #include "pooling_kernel_gpu_byxf_padding_opt.h"
 #include "pooling_kernel_gpu_byxf_af32.h"
 #include "pooling_kernel_gpu_int8_ref.h"
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h"
 #include "pooling_kernel_gpu_b_fs_yx_fsv4.h"
-#include "pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h"
 #include "pooling_kernel_gpu_fs_b_yx_fsv32.h"
 #include "pooling_kernel_gpu_b_fs_yx_fsv16.h"
 #include "pooling_kernel_gpu_bsv16_fsv16.h"
@@ -34,15 +31,12 @@ namespace kernel_selector {
 
 pooling_kernel_selector::pooling_kernel_selector() {
     Attach();
-    //Attach<PoolingKernelGPUAverageOpt>(); TODO: fix the kernel as it reads out of bounds now
     Attach();
     Attach();
     Attach();
     Attach();
     Attach();
-    Attach<PoolingKerneGPU_fs_bs_yx_bsv4_fsv32>();
     Attach();
-    Attach<PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32>();
     Attach();
     Attach();
     Attach();
b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp index e9c1b39c8df..5ec60546c3b 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_base.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ KernelsData QuantizeKernelBase::GetKernelsData(const Params& params, const optio kernel.workGroups.global = {runInfo.gws0, runInfo.gws1, runInfo.gws2}; kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2}; kernel.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, DEFAULT); - kernel.arguments = GetArgsDesc(static_cast(newParams.inputs.size()), false, false, false, false); + kernel.arguments = GetArgsDesc(static_cast(newParams.inputs.size()), false, false); kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE; diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp deleted file mode 100644 index c34f803bd07..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.cpp +++ /dev/null @@ -1,79 +0,0 @@ -/* -// Copyright (c) 2018-2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "scale_grad_weights_kernel_base.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { -JitConstants ScaleGradWeightsKernelBase::GetJitConstants(const scale_grad_weights_params& params) const { - JitConstants jit = training_kernel_base::GetJitConstants(params); - - return jit; -} - -ScaleGradWeightsKernelBase::DispatchData ScaleGradWeightsKernelBase::SetDefault( - const scale_grad_weights_params& params) const { - DispatchData kd; - - kd.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16; - - kd.gws0 = params.inputs[0].Batch().v; - kd.gws1 = params.inputs[0].Feature().v; - kd.gws2 = 1; - - kd.lws0 = params.inputs[0].Batch().v; - kd.lws1 = 1; - kd.lws2 = 1; - kd.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE; - return kd; -} - -KernelsData ScaleGradWeightsKernelBase::GetKernelsData(const Params& params, const optional_params& options) const { - assert(params.GetType() == KernelType::SCALE_GRAD_WEIGHTS); - - const scale_grad_weights_params& orgParams = static_cast(params); - - DispatchData runInfo = SetDefault(orgParams); - KernelData kd = KernelData::Default(params); - - auto cldnn_jit = GetJitConstants(orgParams); - auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options); - auto jit = CreateJit(kernelName, cldnn_jit, entry_point); - - auto& kernel = kd.kernels[0]; - FillCLKernelData(kernel, - runInfo, - params.engineInfo, - kernelName, - jit, - entry_point, - DEFAULT, - true, - !orgParams.bias.empty(), - 2); - - if (orgParams.use_momentum) { - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_WEIGHTS_GRADIENT, 0}); - if (!orgParams.bias.empty()) - kernel.arguments.push_back({ArgumentDescriptor::Types::PREV_BIAS_GRADIENT, 0}); - } - kernel.arguments.push_back({ArgumentDescriptor::Types::LEARNING_RATE, 0}); - - kd.estimatedTime = runInfo.efficiency; - - return {kd}; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h deleted file mode 100644 index 3f83d746146..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_base.h +++ /dev/null @@ -1,58 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#pragma once - -#include "training_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// scale_grad_weights_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct scale_grad_weights_params : public training_params { - scale_grad_weights_params() : training_params(KernelType::SCALE_GRAD_WEIGHTS) {} - - virtual ParamsKey GetParamsKey() const { - ParamsKey k = training_params::GetParamsKey(); - - return k; - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// scale_grad_weights_optional_params -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct scale_grad_weights_optional_params : training_optional_params { - scale_grad_weights_optional_params() : training_optional_params(KernelType::SCALE_GRAD_WEIGHTS) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// ScaleGradWeightsKernelBase -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -class ScaleGradWeightsKernelBase : public training_kernel_base { -public: - using training_kernel_base::training_kernel_base; - virtual ~ScaleGradWeightsKernelBase() {} - - using DispatchData = CommonDispatchData; - -protected: - virtual KernelsData GetKernelsData(const Params& params, const optional_params& options) const; - virtual JitConstants GetJitConstants(const scale_grad_weights_params& params) const; - virtual DispatchData SetDefault(const scale_grad_weights_params& params) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp deleted file mode 100644 index 25bf58b0513..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "scale_grad_weights_kernel_ref.h" - -namespace kernel_selector { - -ParamsKey ScaleGradWeightsKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableInputWeightsType(WeightsType::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBiasPerFeature(); - k.EnableNonBiasTerm(); - k.EnableGradient(); - k.EnableBatching(); - k.DisableTuning(); - k.EnableMomentum(); - return k; -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h deleted file mode 100644 index f0735a2bd55..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_ref.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#pragma once - -#include "scale_grad_weights_kernel_base.h" - -namespace kernel_selector { - -class ScaleGradWeightsKernelRef : public ScaleGradWeightsKernelBase { -public: - ScaleGradWeightsKernelRef() : ScaleGradWeightsKernelBase("scale_grad_weights_gpu_ref") {} - virtual ~ScaleGradWeightsKernelRef() {} - - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp deleted file mode 100644 index 937ade1ab46..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/
-
-#include "scale_grad_weights_kernel_selector.h"
-#include "scale_grad_weights_kernel_ref.h"
-
-namespace kernel_selector {
-scale_grad_weights_kernel_selector::scale_grad_weights_kernel_selector() { Attach<ScaleGradWeightsKernelRef>(); }
-
-KernelsData scale_grad_weights_kernel_selector::GetBestKernels(const Params& params,
-                                                               const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::SCALE_GRAD_WEIGHTS);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h
deleted file mode 100644
index 7022f96936b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/scale_grad_weights/scale_grad_weights_kernel_selector.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-*/
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class scale_grad_weights_kernel_selector : public kernel_selector_base {
-public:
-    static scale_grad_weights_kernel_selector& Instance() {
-        static scale_grad_weights_kernel_selector instance_;
-        return instance_;
-    }
-
-    scale_grad_weights_kernel_selector();
-
-    virtual ~scale_grad_weights_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp
deleted file mode 100644
index 256889e1b6b..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright (c) 2018-2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
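[Editor's note] Every *_kernel_selector pair deleted in this patch (scale_grad_weights above, softmax_loss_grad below) follows the same clDNN registration pattern: a Meyers singleton whose constructor Attach<>()es each candidate implementation, and a GetBestKernels() that defers to GetNaiveBestKernel() with the matching KernelType. A minimal, self-contained sketch of that pattern — simplified stand-in types, not the real clDNN headers:

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-ins for kernel_selector's Params/KernelsData machinery.
struct Params { std::string layer_id; };
using KernelsData = std::vector<std::string>;

class example_kernel_selector {
public:
    // Meyers singleton: one registry per primitive type.
    static example_kernel_selector& Instance() {
        static example_kernel_selector instance_;
        return instance_;
    }

    KernelsData GetBestKernels(const Params& params) const {
        // The real selector ranks candidates by ParamsKey support and
        // efficiency; this naive version returns every registered name.
        KernelsData kd;
        for (const auto& make : implementations_)
            kd.push_back(make(params));
        return kd;
    }

private:
    example_kernel_selector() {
        // Mirrors Attach<ScaleGradWeightsKernelRef>() in the deleted code.
        Attach([](const Params& p) { return "example_gpu_ref:" + p.layer_id; });
    }
    void Attach(std::function<std::string(const Params&)> impl) {
        implementations_.push_back(std::move(impl));
    }
    std::vector<std::function<std::string(const Params&)>> implementations_;
};

int main() {
    for (const auto& k : example_kernel_selector::Instance().GetBestKernels({"layer0"}))
        std::cout << k << '\n';
}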
-
-#include "softmax_loss_grad_kernel_base.h"
-#include "kernel_selector_utils.h"
-#include <vector>
-
-namespace kernel_selector {
-JitConstants SoftmaxLossGradKernelBase::GetJitConstants(const softmax_loss_grad_params& params) const {
-    return MakeBaseParamsJitConstants(params);
-}
-
-CommonDispatchData SoftmaxLossGradKernelBase::SetDefault(const softmax_loss_grad_params& params,
-                                                         const optional_params&) const {
-    CommonDispatchData runInfo;
-
-    std::vector<size_t> global = {params.output.Batch().v * params.output.X().v, 1, 1};
-
-    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
-    runInfo.gws0 = global[0];
-    runInfo.gws1 = global[1];
-    runInfo.gws2 = global[2];
-
-    runInfo.lws0 = local[0];
-    runInfo.lws1 = local[1];
-    runInfo.lws2 = local[2];
-
-    runInfo.efficiency = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
-    runInfo.fp16UnitUsed = params.inputs[0].GetDType() == Datatype::F16;
-
-    return runInfo;
-}
-
-bool SoftmaxLossGradKernelBase::Validate(const Params& p, const optional_params& o) const {
-    if (p.GetType() != KernelType::SOFT_MAX_LOSS_GRAD || o.GetType() != KernelType::SOFT_MAX_LOSS_GRAD) {
-        return false;
-    }
-
-    return true;
-}
-
-KernelsData SoftmaxLossGradKernelBase::GetCommonKernelsData(const Params& params,
-                                                            const optional_params& options) const {
-    if (!Validate(params, options)) {
-        return {};
-    }
-
-    const softmax_loss_grad_params& orgParams = static_cast<const softmax_loss_grad_params&>(params);
-    KernelData kd = KernelData::Default<softmax_loss_grad_params>(params);
-
-    auto runInfo = SetDefault(orgParams, options);
-    auto cldnn_jit = GetJitConstants(orgParams);
-    auto entry_point = GetEntryPoint(kernelName, orgParams.layerID, options);
-    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
-    kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
-
-    kd.estimatedTime = runInfo.efficiency;
-
-    return {kd};
-}
-}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h
deleted file mode 100644
index e16128da90f..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_base.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
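[Editor's note] SetDefault() in the .cpp above sizes the dispatch the way most reference kernels in this tree do: one global work-item per (batch, x) output element, with the local size chosen by GetOptimalLocalWorkGroupSizes(). A standalone sketch of that sizing logic; the helper is approximated here by "largest divisor of the global size within a work-group budget", whereas the real one also consults device limits:

#include <array>
#include <cstddef>
#include <iostream>

// Rough stand-in for GetOptimalLocalWorkGroupSizes(): per axis, pick the
// largest divisor of the global size that keeps the work-group <= max_wg.
std::array<size_t, 3> pick_local(const std::array<size_t, 3>& global, size_t max_wg = 256) {
    std::array<size_t, 3> local{1, 1, 1};
    size_t budget = max_wg;
    for (size_t i = 0; i < 3; ++i) {
        size_t best = 1;
        for (size_t d = 1; d <= global[i] && d <= budget; ++d)
            if (global[i] % d == 0) best = d;
        local[i] = best;
        budget /= best;
    }
    return local;
}

int main() {
    // As in SetDefault() above: gws = { batch * X, 1, 1 } for a [batch, X] output.
    const size_t batch = 8, x = 100;
    std::array<size_t, 3> global{batch * x, 1, 1};
    auto local = pick_local(global);
    std::cout << "gws=" << global[0] << " lws=" << local[0] << '\n';  // lws always divides gws
}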
- -#pragma once - -#include "common_kernel_base.h" -#include "kernel_selector_params.h" - -namespace kernel_selector { -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SoftmaxLossGradParams -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct softmax_loss_grad_params : public base_params { - softmax_loss_grad_params() : base_params(KernelType::SOFT_MAX_LOSS_GRAD) {} - - virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// SoftmaxLossGradOptionalParams -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct softmax_loss_grad_optional_params : optional_params { - softmax_loss_grad_optional_params() : optional_params(KernelType::SOFT_MAX_LOSS_GRAD) {} -}; - -class SoftmaxLossGradKernelBase : public common_kernel_base { -public: - using common_kernel_base::common_kernel_base; - virtual ~SoftmaxLossGradKernelBase() {} - -protected: - virtual bool Validate(const Params&, const optional_params&) const; - virtual JitConstants GetJitConstants(const softmax_loss_grad_params& params) const; - virtual CommonDispatchData SetDefault(const softmax_loss_grad_params& params, - const optional_params& optParams) const; - KernelsData GetCommonKernelsData(const Params& params, const optional_params& optParams) const; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp deleted file mode 100644 index db6af855c47..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
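[Editor's note] The OpenCL source for this primitive is not visible in this part of the patch, but the textbook operation behind a "softmax loss grad" is the gradient of cross-entropy taken through a softmax, dL/dz_i = p_i - [i == label]; presumably the removed reference kernel computed this from its two inputs (probabilities and labels). A host-side sketch of that formula, for orientation only:

#include <cstddef>
#include <iostream>
#include <vector>

// Gradient of cross-entropy loss through a softmax: given probabilities p
// (already softmax-ed) and the true class index, dL/dz_i = p_i - 1[i == label].
std::vector<float> softmax_loss_grad(const std::vector<float>& probs, size_t label) {
    std::vector<float> grad(probs);
    grad[label] -= 1.0f;
    return grad;
}

int main() {
    std::vector<float> p = {0.1f, 0.7f, 0.2f};   // softmax output for one sample
    for (float g : softmax_loss_grad(p, 1))      // true class = 1
        std::cout << g << ' ';                   // -> 0.1 -0.3 0.2
    std::cout << '\n';
}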
- -#include "softmax_loss_grad_kernel_ref.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { -ParamsKey SoftmaxLossGradKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableInputLayout(DataLayout::bfyx); - k.EnableInputLayout(DataLayout::bf); - k.EnableInputLayout(DataLayout::yxfb); - k.EnableInputLayout(DataLayout::byxf); - k.EnableOutputLayout(DataLayout::bfyx); - k.EnableOutputLayout(DataLayout::bf); - k.EnableOutputLayout(DataLayout::yxfb); - k.EnableOutputLayout(DataLayout::byxf); - k.EnableBatching(); - k.EnableGradient(); - return k; -} - -KernelsData SoftmaxLossGradKernelRef::GetKernelsData(const Params& params, const optional_params& options) const { - return GetCommonKernelsData(params, options); -} -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h deleted file mode 100644 index 52f9b66a5be..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_ref.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "softmax_loss_grad_kernel_base.h" - -namespace kernel_selector { -class SoftmaxLossGradKernelRef : public SoftmaxLossGradKernelBase { -public: - using Parent = SoftmaxLossGradKernelBase; - SoftmaxLossGradKernelRef() : Parent("softmax_loss_grad_gpu_ref") {} - virtual ~SoftmaxLossGradKernelRef() {} - - KernelsData GetKernelsData(const Params& params, const optional_params& options) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp deleted file mode 100644 index e4c1a71344b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "softmax_loss_grad_kernel_selector.h"
-#include "softmax_loss_grad_kernel_ref.h"
-
-namespace kernel_selector {
-
-softmax_loss_grad_kernel_selector::softmax_loss_grad_kernel_selector() { Attach<SoftmaxLossGradKernelRef>(); }
-
-KernelsData softmax_loss_grad_kernel_selector::GetBestKernels(const Params& params,
                                                              const optional_params& options) const {
-    return GetNaiveBestKernel(params, options, KernelType::SOFT_MAX_LOSS_GRAD);
-}
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h
deleted file mode 100644
index 03e00d1edd3..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/softmax_loss_grad/softmax_loss_grad_kernel_selector.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "kernel_selector.h"
-
-namespace kernel_selector {
-class softmax_loss_grad_kernel_selector : public kernel_selector_base {
-public:
-    static softmax_loss_grad_kernel_selector& Instance() {
-        static softmax_loss_grad_kernel_selector instance_;
-        return instance_;
-    }
-
-    softmax_loss_grad_kernel_selector();
-
-    virtual ~softmax_loss_grad_kernel_selector() {}
-
-    KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
-};
-}  // namespace kernel_selector
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp
index 78c15188611..b5b0acfdaa9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/strided_slice/strided_slice_kernel_ref.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
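[Editor's note] Besides the deletions, the patch touches strided_slice_kernel_ref.cpp: the hunk below drops a stray semicolon after the closing brace of makeJitConstForParam(). A semicolon after a function definition is an empty declaration; it compiles, but trips -Wextra-semi/-pedantic style warnings, hence the cleanup. Minimal before/after illustration:

// Before: function body terminated with "};" - the extra ';' is an empty
// declaration at namespace scope and is flagged by -Wextra-semi.
static void before() {
};

// After: plain closing brace, as the hunk below rewrites it.
static void after() {
}

int main() { before(); after(); }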
@@ -33,7 +33,7 @@ static void makeJitConstForParam(JitConstants& jit, const std::string name, cons jit.AddConstant(MakeJitConstant(name + "_Y", vec[2])); jit.AddConstant(MakeJitConstant(name + "_X", vec[3])); } -}; +} static size_t GetUsedOutDimsCount(const strided_slice_params& params) { auto dims = params.output.GetDims(); diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl deleted file mode 100644 index aaf60c3ce14..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_gpu_ref.cl +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/include_all.cl" - -#define LOCAL_SIZE INPUT0_BATCH_NUM - -__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) -KERNEL(batch_norm_gpu)( - const __global UNIT_TYPE* input, - #ifdef MEAN_VAR_OUT - __global UNIT_TYPE* mean_out, - __global UNIT_TYPE* variance_out, - #endif - #ifdef SCALE_SHIFT - __global UNIT_TYPE* scale, - __global UNIT_TYPE* shift, - #endif - #ifdef FORWARD - __global UNIT_TYPE* inv_var, - #endif - __global UNIT_TYPE* output) -{ - __local ACCUMULATOR_TYPE sum[LOCAL_SIZE]; - - const uint local_idx = (uint)get_global_id(0); - const uint f = (uint)get_global_id(1); - - sum[local_idx] = 0; - - uint input_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0); - for (uint y = 0; y < INPUT0_SIZE_Y; y++) - { - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - UNIT_TYPE in = input[input_idx]; - sum[local_idx] += in; - input_idx += INPUT0_X_PITCH; - } - input_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - UNIT_TYPE mean = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); -#ifdef MEAN_VAR_OUT - mean_out[f] = mean; -#endif - sum[local_idx] = 0; - - input_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0); - for (uint y = 0; y < INPUT0_SIZE_Y; y++) - { - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - UNIT_TYPE in = input[input_idx] - mean; - sum[local_idx] += in * in; - input_idx += INPUT0_X_PITCH; - } - input_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float variance = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); -#ifdef MEAN_VAR_OUT - variance_out[f] = variance; -#endif - float inv_variance = (float)(1.0 / sqrt(variance + EPSILON)); -#ifdef FORWARD - if (local_idx == 0) - inv_var[f] = inv_variance; -#endif - - uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < 
OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { - #ifdef SCALE_SHIFT - output[out_idx] = (inv_variance * (input[out_idx] - mean)) * scale[f] + shift[f]; - #else - output[out_idx] = inv_variance * (input[out_idx] - mean); - #endif - out_idx += OUTPUT_X_PITCH; - } - out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } -} - -#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl deleted file mode 100644 index 0c698de0f91..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/batch_norm_grad_gpu_ref.cl +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/include_all.cl" - -#define LOCAL_SIZE INPUT0_BATCH_NUM - -__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) -KERNEL(batch_norm_grad_gpu)(const __global UNIT_TYPE* input_grad, __global UNIT_TYPE* input, __global UNIT_TYPE* inv_var, __global UNIT_TYPE* output) -{ - __local ACCUMULATOR_TYPE grad_sum[LOCAL_SIZE]; - __local ACCUMULATOR_TYPE grad_sum_in[LOCAL_SIZE]; - - const uint local_idx = (uint)get_local_id(0); - const uint f = (uint)get_global_id(1); - - grad_sum[local_idx] = 0; - grad_sum_in[local_idx] = 0; - - uint grad_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0); - for (uint y = 0; y < INPUT0_SIZE_Y; y++) - { - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - UNIT_TYPE in_g = input_grad[grad_idx]; - grad_sum[local_idx] += in_g; - grad_sum_in[local_idx] += in_g * input[grad_idx]; - grad_idx += INPUT0_X_PITCH; - } - grad_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - grad_sum[local_idx] += grad_sum[local_idx + offset]; - grad_sum_in[local_idx] += grad_sum_in[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - UNIT_TYPE grad_mean = grad_sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - UNIT_TYPE grad_mean_in = grad_sum_in[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - - uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { - UNIT_TYPE grad_out = inv_var[f] * (input_grad[out_idx] - grad_mean - grad_mean_in * input[out_idx]); - - if (grad_out > 5.0f) - grad_out = 5.0f; - else if (grad_out < -5.0f) - grad_out = -5.0f; - - output[out_idx] = grad_out; - out_idx += OUTPUT_X_PITCH; - } - out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - -} - -#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl deleted file mode 100644 index 
b15787539d7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/contract_ref.cl +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - - -KERNEL(contract_ref)( - const __global INPUT0_TYPE* input, - __global INPUT0_TYPE* output) -{ - INPUT0_TYPE out_val = REDUCE_SEED; - -#if REDUCE_B - for (uint in_b = 0; in_b < INPUT0_BATCH_NUM; ++in_b) { -#else - const uint in_b = (uint) get_global_id(DIM_B); -#endif - -#if REDUCE_F - for (uint in_f = 0; in_f < INPUT0_FEATURE_NUM; ++in_f) { -#else - const uint in_f = (uint) get_global_id(DIM_F); -#endif - -#if REDUCE_Y - for (uint in_y = 0; in_y < INPUT0_SIZE_Y; ++in_y) { -#else - const uint in_y = (uint) get_global_id(DIM_Y); -#endif - -#if REDUCE_X - for (uint in_x = 0; in_x < INPUT0_SIZE_X; ++in_x) { -#else - const uint in_x = (uint) get_global_id(DIM_X); -#endif - - out_val = REDUCE_OPERATION(out_val, input[GET_DATA_INDEX(INPUT0, in_b, in_f, in_y, in_x)]); - -#if REDUCE_X - } -#endif -#if REDUCE_Y - } -#endif -#if REDUCE_F - } -#endif -#if REDUCE_B - } -#endif - - output[GET_DATA_INDEX(OUTPUT, 0, get_global_id(0), get_global_id(1), get_global_id(2))] = out_val; -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl deleted file mode 100644 index 0cb1f5fc289..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_1x1_gemm_mmad.cl +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
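[Editor's note] contract_ref.cl above reduces a bfyx tensor over whichever of the four axes the REDUCE_B/F/Y/X flags select, seeding the accumulator with REDUCE_SEED and folding with REDUCE_OPERATION (both chosen at JIT time). The same control flow in plain C++, with the compile-time flags turned into runtime booleans:

#include <cstddef>
#include <iostream>
#include <vector>

// Reduce a dense [B, F, Y, X] tensor over the axes whose flag is set,
// mirroring contract_ref.cl: non-reduced axes index the output, reduced
// axes become inner loops folded with `op` starting from `seed`.
template <typename Op>
std::vector<float> contract(const std::vector<float>& in,
                            size_t B, size_t F, size_t Y, size_t X,
                            bool rB, bool rF, bool rY, bool rX,
                            float seed, Op op) {
    const size_t oB = rB ? 1 : B, oF = rF ? 1 : F, oY = rY ? 1 : Y, oX = rX ? 1 : X;
    std::vector<float> out(oB * oF * oY * oX, seed);
    for (size_t b = 0; b < B; ++b)
        for (size_t f = 0; f < F; ++f)
            for (size_t y = 0; y < Y; ++y)
                for (size_t x = 0; x < X; ++x) {
                    const size_t ob = rB ? 0 : b, of = rF ? 0 : f,
                                 oy = rY ? 0 : y, ox = rX ? 0 : x;
                    float& acc = out[((ob * oF + of) * oY + oy) * oX + ox];
                    acc = op(acc, in[((b * F + f) * Y + y) * X + x]);
                }
    return out;
}

int main() {
    std::vector<float> t(2 * 3 * 1 * 1);
    for (size_t i = 0; i < t.size(); ++i) t[i] = float(i);
    // Sum over batch (REDUCE_B): output shape [1, 3, 1, 1].
    auto r = contract(t, 2, 3, 1, 1, true, false, false, false,
                      0.0f, [](float a, float b) { return a + b; });
    for (float v : r) std::cout << v << ' ';  // -> 3 5 7
    std::cout << '\n';
}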
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_1x1_gemm_MMAD)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif - uint split_idx) -{ - const uint sg_channel = get_sub_group_local_id(); - - const uint x = ((uint)get_group_id(0) * 8) % INPUT0_SIZE_X; - const uint y = ((uint)get_group_id(0) * 8) / INPUT0_SIZE_X; - const uint f = (uint)get_global_id(1) % FILTER_OFM_ALIGNED; - const uint b = (uint)get_global_id(1) / FILTER_OFM_ALIGNED; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; - - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset; - uint in_addr = input_offset + input_x * INPUT0_X_PITCH + input_y * INPUT0_Y_PITCH; - - const uint filter_offset = ((uint)get_group_id(1) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint filter_idx = filter_offset; - - int8 tileA; - int8 tileB; - int8 tileC; - for(uint i = 0; i < 8; i++) - { - tileC[i] = 0; - } - - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - // load A tile ( input ) - for(uint i = 0; i < 8; i++) - { - uint tmp_addr = in_addr + i * INPUT0_X_PITCH; - tileA[i] = as_int(intel_sub_group_block_read((const __global uint*)(input + tmp_addr))); - } - - // load B tile ( weights ) - tileB = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - // compute C tile ( output ) - tileC = MMAD_8x8(tileA, tileB, tileC); - - in_addr += 32; // 4 features per channel * 8 SIMD channels - filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel - } - -#if BIAS_TERM -#if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x); -#elif BIAS_PER_OFM - const uint bias_index = f; -#endif - for(uint i = 0; i < 8; i++) - { -#if CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]); -#else // CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * O_QF); -#endif // CALIBRATION_TERM - } -#endif // BIAS_TERM - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - // save to output - for(uint i = 0; i < 8; i++) - { - const uint curr_x = (x + i) % INPUT0_SIZE_X; - const uint curr_y = y + (x + i) / INPUT0_SIZE_X; - if(curr_x < INPUT0_SIZE_X && curr_y < INPUT0_SIZE_Y) - { - const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, curr_y, curr_x) + out_split_offset; - output[dst_index] = ACTIVATION(convert_char(tileC[i]), ACTIVATION_PARAMS); - } - } -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index 1f9424253a3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byx8_f4__fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_SLICES ((FILTER_IFM_NUM + 3) /4) -#define FILTER_SIZE_X_SLICES ((FILTER_SIZE_X + 7) / 8) - -#define FILTER_IFM_SLICE_PITCH (32 * (FILTER_SIZE_X_SLICES * 8) * FILTER_SIZE_Y) -#define FILTER_OFM_SLICE_PITCH (FILTER_IFM_SLICE_PITCH * FILTER_IFM_SLICES) - -#define OUT_BLOCK_BATCH 2 -#define OUT_BLOCK_HEIGHT 2 -#define WEIGHTS_PER_WORKITEM 4 // currently needs to be set to 4, check output stage and float4 on quantizations etc. - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - out[w + pb * 4] = convert_uchar_sat((float)dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb][i] * SCALE + bias_f[w]); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - out[w + pb * 4] = convert_uchar_sat(dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb][i]); - -#else - -#define QUANTIZATION \ - out[w + pb * 4] = as_uchar( ACTIVATION( convert_char( round( ( (float)dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w])), ACTIVATION_PARAMS)); - -#endif - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution_gpu_byx8_f4_fs_bs_yx_bsv4_fsv32)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, - __global BIAS_TYPE* biases, - __global float* quantizations, -#if CALIBRATION_TERM - __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = get_group_id(1) * 8; - const uint y = get_group_id(2) * OUT_BLOCK_HEIGHT; - - const uint bf_id = ((uint)get_group_id(0) * WG_BATCH_SIZE + (uint)get_sub_group_id()) * 8 * WEIGHTS_PER_WORKITEM; - - const uint f = (bf_id) % OUTPUT_FEATURE_NUM; - const uint b = OUT_BLOCK_BATCH * (bf_id / OUTPUT_FEATURE_NUM); - - int8 dotProd[OUT_BLOCK_BATCH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - uint filter_offset = (f/8)*FILTER_OFM_SLICE_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET; - - for (uint k = 0; k < FILTER_IFM_SLICES; ++k) - { - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j * DILATION_SIZE_Y; - - __attribute__((opencl_unroll_hint(FILTER_SIZE_X_SLICES))) - for(uint i = 0; i < FILTER_SIZE_X_SLICES; i++) - { - const uint filter_spatial_offset = 32 * (i*8 + (FILTER_SIZE_X_SLICES * 8) * 
j); - - int8 act_reg[OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH]; // activations for MMAD - - // preload batch data - __attribute__((opencl_unroll_hint(OUT_BLOCK_BATCH))) - for(uint pb = 0; pb < OUT_BLOCK_BATCH; pb++) - { - // preload spatial data - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) - { - uint input_idx = GET_DATA_BYX8_F4_INDEX(INPUT0, b + pb, k * 4, input_offset_y + h * STRIDE_SIZE_Y, input_x + i * 8); - int2 _input_data_01 = as_int2(intel_sub_group_block_read2((__global uint*)(input + input_idx))); - int _input_data_2 = as_int(intel_sub_group_block_read((__global uint*)(input + input_idx + 8 * 8))); - - act_reg[h * OUT_BLOCK_BATCH + pb][0] = _input_data_01[0]; - act_reg[h * OUT_BLOCK_BATCH + pb][1] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 1); - act_reg[h * OUT_BLOCK_BATCH + pb][2] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 2); - act_reg[h * OUT_BLOCK_BATCH + pb][3] = intel_sub_group_shuffle_down(_input_data_01[0], _input_data_01[1], STRIDE_SIZE_X * 3); - act_reg[h * OUT_BLOCK_BATCH + pb][4] = _input_data_01[1]; - act_reg[h * OUT_BLOCK_BATCH + pb][5] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 1); - act_reg[h * OUT_BLOCK_BATCH + pb][6] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 2); - act_reg[h * OUT_BLOCK_BATCH + pb][7] = intel_sub_group_shuffle_down(_input_data_01[1], _input_data_2, STRIDE_SIZE_X * 3); - } - } - - uint filter_idx = filter_offset + filter_spatial_offset; - - // preload weights - int8 _weights[WEIGHTS_PER_WORKITEM]; - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) // iterate over output feature channels for weights - { - _weights[w] = as_int8(intel_sub_group_block_read8((__global uint*)(weights + filter_idx))); - filter_idx += FILTER_OFM_SLICE_PITCH; - } - - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) // iterate over output feature channels for weights - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_BATCH))) - for(uint pb = 0; pb < OUT_BLOCK_BATCH; pb++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) - { - // MMAD on 8x WEIGHTS_PER_WORKITEM input channels elements for 8x outputs in WI - dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb] = MMAD_8x8(act_reg[h * OUT_BLOCK_BATCH + pb], _weights[w], dotProd[w*OUT_BLOCK_HEIGHT*OUT_BLOCK_BATCH + h*OUT_BLOCK_BATCH + pb]); - } - } - } - } - } - filter_offset += FILTER_IFM_SLICE_PITCH; - } - - -const uint sg_local_f = get_sub_group_local_id() * 4; -float4 quant_f = vload4(0, quantizations + f + sg_local_f); -float4 bias_f = vload4(0, biases + f + sg_local_f); -float4 calib_f = vload4(0, calibrations + f + sg_local_f); - -__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) -for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) -{ - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y + h, x); - - __attribute__((opencl_unroll_hint(8))) - for(uint i = 0; i < 8; i++) - { - - #if WEIGHTS_PER_WORKITEM == 4 - - uchar8 out; - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint pb = 0; pb < OUT_BLOCK_BATCH; pb++) - { - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - QUANTIZATION; - } - } - intel_sub_group_block_write2((__global unsigned int*)(output + dst_index + 32 * 4 * i), as_uint2(out)); - - 
#else - #error NOT IMPLEMENTED - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - #if CALIBRATION_TERM - dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * calib_f[w]); - #else // CALIBRATION_TERM - dotProd[w*OUT_BLOCK_HEIGHT + h][i] = (UNIT_TYPE)round(((float)dotProd[w*OUT_BLOCK_HEIGHT + h][i] * quant_f[w] * I_QF + bias_f[w]) * O_QF); - #endif // CALIBRATION_TERM - output[dst_index + 32 * 4 * i + 8 * w] = ACTIVATION(convert_char(dotProd[w*OUT_BLOCK_HEIGHT + h][i]), ACTIVATION_PARAMS); - } - - #endif - } -} - -} - -#undef OUT_BLOCK_HEIGHT -#undef WEIGHTS_PER_WORKITEM - -#undef FILTER_SIZE_X_SLICES -#undef FILTER_IFM_SLICES - -#undef FILTER_IFM_SLICE_PITCH -#undef FILTER_OFM_SLICE_PITCH - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index 0da4d28d15f..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_byxf_fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
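[Editor's note] The int8 convolution kernels removed in this patch share one epilogue, visible in the QUANTIZATION macros of the byx8_f4 kernel above and the mmad_32x32sg kernels below: the int32 accumulator is rescaled as round((acc * quant * I_QF + bias) * calib), pushed through the activation, then saturated to 8 bits. A scalar C++ equivalent of that epilogue; I_QF is the input quantization factor baked in at JIT time, and ReLU is assumed for ACTIVATION in this example:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Per-output-feature requantization used by the deleted MMAD int8 kernels:
// int32 accumulator -> float rescale -> activation -> saturating int8.
int8_t requantize(int32_t acc, float quant, float i_qf, float bias, float calib) {
    float v = (static_cast<float>(acc) * quant * i_qf + bias) * calib;
    v = std::round(v);
    v = std::max(v, 0.0f);  // assuming a ReLU ACTIVATION for the example
    return static_cast<int8_t>(std::clamp(v, -128.0f, 127.0f));
}

int main() {
    // Hypothetical per-feature constants (quantizations[f], biases[f], calibrations[f]).
    std::cout << int(requantize(/*acc=*/1234, /*quant=*/0.05f, /*i_qf=*/1.0f,
                                /*bias=*/3.0f, /*calib=*/0.5f)) << '\n';  // -> 32
}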
- -#include "include/include_all.cl" - -#define OBS 8 -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif - uint split_idx) -{ - const uint f_pack = ((uint)get_group_id(0) * 32) % OUTPUT_FEATURE_NUM; - const uint b = ((uint)get_group_id(0) * 32) / OUTPUT_FEATURE_NUM; - - const uint x = (uint)get_group_id(1) * OBS; - const uint y = get_group_id(2); - - int4 dotProd[OBS] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = f_pack*FILTER_OFM_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET; - - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j; - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - const int input_offset_x = input_x + i + STRIDE_SIZE_X * get_sub_group_local_id(); - uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH; - uint filter_idx = filter_offset + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; - - char input_data[3]; - char2 _i = vload2(0, input + input_idx); - input_data[0] = _i.s0; - input_data[1] = _i.s1; - input_data[2] = input[input_idx + 2]; - - for (uint k = 0; k < FILTER_IFM_NUM; ++k) - { - char4 w_data = as_char4(intel_sub_group_block_read((const __global uint*)(weights + filter_idx))); - for(uint r = 0; r < OBS; r++) - { - char in = intel_sub_group_shuffle(input_data[k], r); - for(uint c = 0; c < 4; c++) - { - dotProd[r][c] += (int)in * (int)w_data[c]; - } - } - filter_idx += FILTER_IFM_PITCH; - } - } - } - - -const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f_pack, y, x + get_sub_group_local_id()); -const uint _f_idx = f_pack + get_sub_group_local_id() * 4; -float4 quants = vload4(0, quantizations + _f_idx ); -float4 calibs = vload4(0, calibrations + _f_idx ); -float4 bias = vload4(0, biases + _f_idx ); -for(uint r = 0; r < OBS; r++) -{ - char4 char_output; - for(uint c = 0; c < 4; c++) - { - const uint f_idx = f_pack + get_sub_group_local_id() * 4 + c; - #if BIAS_TERM - const uint bias_index = f_idx; - #if CALIBRATION_TERM - dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * calibs[c]); - #else // CALIBRATION_TERM - dotProd[r][c] = (UNIT_TYPE)round(((float)dotProd[r][c] * quants[c] * I_QF + bias[c]) * O_QF); - #endif // CALIBRATION_TERM - #endif - char_output[c] = ACTIVATION(convert_char(dotProd[r][c]), ACTIVATION_PARAMS); - } - const uint out_idx = intel_sub_group_shuffle(dst_index, r); - intel_sub_group_block_write( (__global uint*)(output + out_idx) , as_uint(char_output)); -} - -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl deleted file mode 100644 index 61ed1de16f9..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_128x128wg_slm_int8.cl +++ /dev/null @@ -1,396 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - for(uint z = 0; z < 4; z++)\ - {\ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + bias_f.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + bias_f.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + bias_f.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + bias_f.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = convert_uchar_sat(regC[0 * 4 + i][idx]);\ - regC_uchar16.s1 = convert_uchar_sat(regC[1 * 4 + i][idx]);\ - regC_uchar16.s2 = convert_uchar_sat(regC[2 * 4 + i][idx]);\ - regC_uchar16.s3 = convert_uchar_sat(regC[3 * 4 + i][idx]);\ - \ - regC_uchar16.s4 = convert_uchar_sat(regC[0 * 4 + i][idx+1]);\ - regC_uchar16.s5 = convert_uchar_sat(regC[1 * 4 + i][idx+1]);\ - regC_uchar16.s6 = convert_uchar_sat(regC[2 * 4 + i][idx+1]);\ - regC_uchar16.s7 = convert_uchar_sat(regC[3 * 4 + i][idx+1]);\ - \ - regC_uchar16.s8 = convert_uchar_sat(regC[0 * 4 + i][idx+2]);\ - regC_uchar16.s9 = convert_uchar_sat(regC[1 * 4 + i][idx+2]);\ - regC_uchar16.sa = convert_uchar_sat(regC[2 * 4 + i][idx+2]);\ - regC_uchar16.sb = convert_uchar_sat(regC[3 * 4 + i][idx+2]);\ - \ - regC_uchar16.sc = convert_uchar_sat(regC[0 * 4 + i][idx+3]);\ - regC_uchar16.sd = convert_uchar_sat(regC[1 * 4 + i][idx+3]);\ - regC_uchar16.se = convert_uchar_sat(regC[2 * 4 + i][idx+3]);\ - regC_uchar16.sf = convert_uchar_sat(regC[3 * 4 + i][idx+3]); - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF 
+ bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - -#endif - - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - 
__attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8) - ( - __global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx - - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // Each work-group works to compute 128x128 tile. - // Each work-group contains 16 sub-groups. - // Each sub-group within the work-group works to compute a 32x32 tile. - // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). - // 2) Each sub-group works to compute 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. - // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") - __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 - __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024 - - __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; - __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; - __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; - - const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y); - - const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); - const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); - const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); - const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); - - // Thread IDs - const uint g_tidY = get_global_id(DIM_Y); // 0,...,all_wi_inY - const uint g_tidX = get_global_id(DIM_X); // 0,...,all_wi_inX - const uint l_tidX = get_local_id(DIM_X); // 0,...,31 in WG - const uint l_tidY = get_local_id(DIM_Y); // 0,1,2,3 in WG - const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; // 0,1,2,...127 - - // SubGroup IDs - const uint sg_tid = get_sub_group_local_id(); // 0,1,...,8 - const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); //{0}/8 - const uint sg_global_idY = g_tidY; //{0} - - const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); // {0,...,31}/8={0,0,0,0,0...,1,1,1,...,3,3,3} - const uint sg_local_idY = l_tidY; // 0,1,2,3 - const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX; // get_local_size(DIM_X) / SG_SIZE = 32/8 = 4 - - const uint sub_group_id = get_sub_group_id(); - - - // Registers - int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts // (32/8)*4 - int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA - int8 colB[2]; // each lane will store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * 
sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices - uint g_idxA[2]; - uint g_idxB[2]; -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; - g_idxA[1] = g_idxA[0] + l_groupSize; - g_idxB[1] = g_idxB[0] + l_groupSize; -#else // Row (matrixA) and Col (matrixB) major layout - g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - /* - * SLM setup - HDC read only - */ - // Overlap HDC reads with mmad compute - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - /* - * mmad compute - */ - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - /* - * SLM setup - SLM write only - */ - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * 
l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - /* - * Last mmad compute iteration (avoids branching in main loop) - */ - - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); - float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif - -} - -#undef QUANTIZATION -#undef SCALE diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl deleted file mode 100644 index 6fccacc0ac5..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_224x128wg_slm_int8.cl +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - for(uint z = 0; z < 4; z++)\ - {\ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + bias_f.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + bias_f.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + bias_f.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + bias_f.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = convert_uchar_sat(regC[0 * 4 + i][idx]);\ - regC_uchar16.s1 = convert_uchar_sat(regC[1 * 4 + i][idx]);\ - regC_uchar16.s2 = convert_uchar_sat(regC[2 * 4 + i][idx]);\ - regC_uchar16.s3 = convert_uchar_sat(regC[3 * 4 + i][idx]);\ - \ - regC_uchar16.s4 = convert_uchar_sat(regC[0 * 4 + i][idx+1]);\ - regC_uchar16.s5 = convert_uchar_sat(regC[1 * 4 + i][idx+1]);\ - regC_uchar16.s6 = convert_uchar_sat(regC[2 * 4 + i][idx+1]);\ - regC_uchar16.s7 = convert_uchar_sat(regC[3 * 4 + i][idx+1]);\ - \ - regC_uchar16.s8 = convert_uchar_sat(regC[0 * 4 + i][idx+2]);\ - regC_uchar16.s9 = convert_uchar_sat(regC[1 * 4 + i][idx+2]);\ - regC_uchar16.sa = convert_uchar_sat(regC[2 * 4 + i][idx+2]);\ - regC_uchar16.sb = convert_uchar_sat(regC[3 * 4 + i][idx+2]);\ - \ - regC_uchar16.sc = convert_uchar_sat(regC[0 * 4 + i][idx+3]);\ - regC_uchar16.sd = convert_uchar_sat(regC[1 * 4 + i][idx+3]);\ - regC_uchar16.se = convert_uchar_sat(regC[2 * 4 + i][idx+3]);\ - regC_uchar16.sf = convert_uchar_sat(regC[3 * 4 + i][idx+3]); - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF 
+ bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - -#endif - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - 
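For reference, the four unrolled loops in mmad_32x32_int8 build one 32x32 accumulator tile out of a 4x4 grid of MMAD_8x8 calls, alternating colB[0]/colB[1] so two registers cover the four 32x8 slices of tileB. Each int operand packs four int8 values along K, so an 8-int register spans K = 32. Ignoring how rows are distributed across sub-group lanes, each MMAD_8x8 step behaves like an int8 dot-product accumulation; a minimal scalar model in C, with an assumed operand layout (not the device intrinsic), is:

#include <stdint.h>

/* Scalar model of one MMAD_8x8 step: an 8x8 tile of 32-bit
 * accumulators absorbs an 8x32 int8 slice of A and a 32x8 int8
 * slice of B. Layout is assumed; the real intrinsic operates on
 * packed ints across sub-group lanes. */
void mmad_8x8_ref(const int8_t a[8][32], const int8_t b[32][8],
                  int32_t acc[8][8])
{
    for (int m = 0; m < 8; ++m)
        for (int n = 0; n < 8; ++n)
            for (int k = 0; k < 32; ++k)
                acc[m][n] += (int32_t)a[m][k] * (int32_t)b[k][n];
}

Swapping colB between block reads lets the kernel keep only two column registers live while still covering all four column groups of regC.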
__attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_224x128WG_SLM_INT8) - (__global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx - - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // Each work-group works to compute 128x128 tile. - // Each work-group contains 16 sub-groups. - // Each sub-group within the work-group works to compute a 32x32 tile. - // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). - // 2) Each sub-group works to compute 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. - // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") - __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; - __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; - - __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA; - __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA; - __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB; - - const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y); - - const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint); - const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8); - const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4); - const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4); - - // Thread IDs - const uint g_tidY = get_global_id(DIM_Y); - const uint g_tidX = get_global_id(DIM_X); - const uint l_tidX = get_local_id(DIM_X); - const uint l_tidY = get_local_id(DIM_Y); - const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; - - // SubGroup IDs - const uint sg_tid = get_sub_group_local_id(); - const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); - const uint sg_global_idY = g_tidY; - const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); - const uint sg_local_idY = l_tidY; - const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX; - - const uint sub_group_id = get_sub_group_id(); - - // Registers - int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts - int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA - int8 colB[2]; // each lane will store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const 
uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices - uint g_idxA[2]; - uint g_idxB[2]; -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; - g_idxA[1] = g_idxA[0] + l_groupSize; - g_idxB[1] = g_idxB[0] + l_groupSize; -#else // Row (matrixA) and Col (matrixB) major layout - g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - - //MMAD compute - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - //SLM setup - SLM write only - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - if (l_tid < 32) - { - // Not all work-items will be 
needed to fetch the remaining matrix B - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - //Last MMAD compute iteration (avoids branching in main loop) - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); - float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } - -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl deleted file mode 100644 index ddc43c1f789..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_32x32sg_slm_int8.cl +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -} - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8) - ( - __global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx - - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // 1) All work-items in work-group fill SLM with tileA and tileB. - // 2) Each sub-group works to compute a 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. 
- // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows") - __local int8 l_workGroupTileA_0[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; - __local int8 l_workGroupTileB_0[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; - __local uint* l_workGroupTileA_uint_0 = (__local uint*)l_workGroupTileA_0; - - __local int8 l_workGroupTileA_1[(WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; - __local int8 l_workGroupTileB_1[(WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; - __local uint* l_workGroupTileA_uint_1 = (__local uint*)l_workGroupTileA_1; - - __local int8* l_workGroupTileA_live = l_workGroupTileA_0; - __local int8* l_workGroupTileB_live = l_workGroupTileB_0; - __local uint* l_workGroupTileA_live_uint = l_workGroupTileA_uint_0; - - __local int4* l_workGroupTileA_0_int4 = (__local int4*)l_workGroupTileA_0; - __local int4* l_workGroupTileB_0_int4 = (__local int4*)l_workGroupTileB_0; - __local int4* l_workGroupTileA_1_int4 = (__local int4*)l_workGroupTileA_1; - __local int4* l_workGroupTileB_1_int4 = (__local int4*)l_workGroupTileB_1; - - const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y); - - // Thread IDs - const uint g_tidY = get_global_id(DIM_Y); - const uint g_tidX = get_global_id(DIM_X); - const uint l_tidX = get_local_id(DIM_X); - const uint l_tidY = get_local_id(DIM_Y); - const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; - - // SubGroup IDs - const uint sg_tid = get_sub_group_local_id(); - const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); - const uint sg_global_idY = g_tidY; - const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); - const uint sg_local_idY = l_tidY; - const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX; - - const uint sub_group_id = get_sub_group_id(); - - // Registers - int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item responsible for 32x4 ints elts - int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA - int8 colB[2]; // each lane will store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - uint g_idxA = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - uint g_idxB = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; -#else // Row (matrixA) and Col (matrixB) major layout - uint g_idxA = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - uint g_idxB = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); -#endif - - // Initial SLM setup - { - uint g_idxATemp = g_idxA; - for (uint i = l_tid; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE) - { - l_workGroupTileA_0_int4[i] 
= g_matrixA[g_idxATemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxATemp += WG_SIZE; -#else - g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - - uint g_idxBTemp = g_idxB; - for (uint i = l_tid; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE) - { - l_workGroupTileB_0_int4[i] = g_matrixB[g_idxBTemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxBTemp += WG_SIZE; -#else - g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA += MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int4 hdcReadValueA[(WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1 - ? 1 - : (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE]; - int4 hdcReadValueB[(WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE < 1 - ? 1 - : (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)) / WG_SIZE]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - /* - * SLM setup - HDC read only - */ - -#if ((MATRIX_K / MATRIX_SMALL_K) > 1) - uint g_idxATemp = g_idxA; - for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j) - { - hdcReadValueA[j] = g_matrixA[g_idxATemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxATemp += WG_SIZE; -#else - g_idxATemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - - uint g_idxBTemp = g_idxB; - for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); i += WG_SIZE, ++j) - { - hdcReadValueB[j] = g_matrixB[g_idxBTemp]; -#ifdef TILED_GLOBAL_LAYOUT - g_idxBTemp += WG_SIZE; -#else - g_idxBTemp += (WG_SIZE / 2) * (MATRIX_K / sizeof(int4)); -#endif - } - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA += MATRIX_SMALL_K / sizeof(int4); - g_idxB += MATRIX_SMALL_K / sizeof(int4); -#endif -#endif - - /* - * MMAD compute - */ - - FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, l_offsetTileA, l_workGroupTileB_live, - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - /* - * SLM setup - SLM write only - */ - -#if ((MATRIX_K / MATRIX_SMALL_K) > 1) - if (k % 2 == 0) - { - for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileA_1_int4[i] = hdcReadValueA[j]; - } - - for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileB_1_int4[i] = hdcReadValueB[j]; - } - - l_workGroupTileA_live = l_workGroupTileA_1; - l_workGroupTileB_live = l_workGroupTileB_1; - l_workGroupTileA_live_uint = l_workGroupTileA_uint_1; - } - else - { - for (uint i = l_tid, j = 0; i < (WG_TILE_M * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileA_0_int4[i] = hdcReadValueA[j]; - } - - for (uint i = l_tid, j = 0; i < (WG_TILE_N * MATRIX_SMALL_K / sizeof(int4)); - i += WG_SIZE, ++j) - { - l_workGroupTileB_0_int4[i] = hdcReadValueB[j]; - } - - l_workGroupTileA_live = l_workGroupTileA_0; - l_workGroupTileB_live = l_workGroupTileB_0; - l_workGroupTileA_live_uint = l_workGroupTileA_uint_0; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif - } - - /* - * Last MMAD compute iteration (avoids branching in main loop) - */ - FUNC_CALL(mmad_32x32_int8)(l_workGroupTileA_live_uint, 
l_offsetTileA, l_workGroupTileB_live, - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar8 regC_uchar8[SIMD_LANE_M * SIMD_LANE_N / (sizeof(uchar8) / sizeof(uchar))]; - uint offset_uc8 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id() + feature_off; - - float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) )); - float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) )); - float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) )); - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - // begin of account for output PADDING - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - // end of account for padding - - // B0 F0..31 - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - // B1 F0..31 - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - padded_offset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - - // B2 F0..31 - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - 
regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - // B3 F0..31 - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - // - - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s4) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s4) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s4) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s4) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s5) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s5) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s5) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s5) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - padded_offset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - - regC_uchar8[offset_uc8].s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s6) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s6) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s6) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s6) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - regC_uchar8[offset_uc8].s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i].s7) * quant_f.s0 * I_QF + bias_f.s0) 
* calib_f.s0)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s5 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i].s7) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i].s7) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); - regC_uchar8[offset_uc8].s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i].s7) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); - - FUNC_CALL(sub_group_block_write_uchar8)(&g_outC_uchar[padded_offset], regC_uchar8[offset_uc8]); - cOffset += sizeof(uchar8) * SG_SIZE; - offset_uc8++; - } -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif - -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl index b41428dc4e3..d94dbbd0cdc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32.cl @@ -29,7 +29,6 @@ #undef TO_ACCUMULATOR_TYPE #endif -#if QUANTIZATION_TERM #define ACCUMULATOR_TYPE int #define TO_ACCUMULATOR_TYPE(x) convert_int(x) #define ACTIVATION_TYPE float @@ -55,10 +54,6 @@ #error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported block size" #endif -#else // QUANTIZATION_TERM -#error "convolution_gpu_mmad_b_fs_yx_fsv32: invalid parameters: quantization term is expected to be true" -#endif - __attribute__((reqd_work_group_size(8, OW_GROUP, 1))) __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) KERNEL(convolution_mmad_b_fs_yx_fsv32)( diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl deleted file mode 100644 index 075a5b04c2d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched.cl +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
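Both the uchar8 path above and the uchar16 path in the sibling kernels apply the same per-channel requantization before the sub-group block write: scale the int32 accumulator by the channel's quantization factor and I_QF, add the bias, scale by the calibration factor, round, narrow to char, then apply ACTIVATION. A scalar C model of that step follows; the helper name is hypothetical, and note that the kernel's convert_char truncates to the low bits rather than saturating, so the clamp here is an assumption made to keep the model well-defined.

#include <math.h>
#include <stdint.h>

/* Reference for the per-channel requantization: acc is the int32
 * MMAD accumulator for one output element; quant/bias/calib are the
 * per-feature parameters loaded from the quantizations, biases and
 * calibrations buffers; i_qf mirrors the I_QF macro. */
int8_t requantize_ref(int32_t acc, float quant, float i_qf,
                      float bias, float calib)
{
    float v = ((float)acc * quant * i_qf + bias) * calib;
    long r = lroundf(v);
    if (r > 127)  r = 127;   /* assumed clamp; convert_char wraps */
    if (r < -128) r = -128;
    return (int8_t)r;        /* ACTIVATION(...) is applied after this */
}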
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) -// input data is in blocks 4batch x 32 features -// each SIMD process 4 batches and 8 output features - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_mmad_batched)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - const __global float* quantizations, -#endif -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = get_global_id(0); - const uint y = get_global_id(1); - - const uint f = (uint)get_global_id(2) % FILTER_OFM_ALIGNED; - const uint b_block = (uint)get_global_id(2) / FILTER_OFM_ALIGNED; - const uint f_block = f / 32; - - int4 dotProd = 0; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = ((uint)get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - const uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block; - - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j * DILATION_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - - if(!zero_y) - { - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - const int input_offset_x = input_x + i * DILATION_SIZE_X; - const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - - if(!zero_x) - { - uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH + k * IN_F_BLOCK_PITCH; - uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; - - int4 input_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); - int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - dotProd = MMAD_4x8(input_data, weights_data, dotProd); - } - } - } - } - } - -for(uint b = 0; b < 4; b++) -{ - -#if BIAS_TERM - const uint bias_index = f; -#if QUANTIZATION_TERM -#if CALIBRATION_TERM - dotProd[b] = (UNIT_TYPE)round(((float)dotProd[b] * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]); -#else // CALIBRATION_TERM - dotProd[b] = (UNIT_TYPE)round(((float)dotProd[b] * quantizations[f] * I_QF + biases[bias_index]) * O_QF); -#endif // CALIBRATION_TERM -#else // QUANTIZATION_TERM - dotProd[b] += (UNIT_TYPE)biases[bias_index]; -#endif // QUANTIZATION_TERM -#endif // BIAS_TERM - - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4 + b, f, y, x); -#if QUANTIZATION_TERM - output[dst_index] = ACTIVATION(convert_char(dotProd[b]), ACTIVATION_PARAMS); -#else - output[dst_index] = ACTIVATION(dotProd[b], ACTIVATION_PARAMS); -#endif -} -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl deleted file mode 100644 
index 583d1bea3f3..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block.cl +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 0][b] * SCALE + bias_f.s0);\ - out[1] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 1][b] * SCALE + bias_f.s1);\ - out[2] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 2][b] * SCALE + bias_f.s2);\ - out[3] = convert_uchar_sat((float)dotProd[o + OUT_BLOCK_WIDTH * 3][b] * SCALE + bias_f.s3); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 0][b]);\ - out[1] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 1][b]);\ - out[2] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 2][b]);\ - out[3] = convert_uchar_sat(dotProd[o + OUT_BLOCK_WIDTH * 3][b]); - -#else - -#define QUANTIZATION \ - char4 out;\ - out[0] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 0][b] * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS);\ - out[1] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 1][b] * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS);\ - out[2] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 2][b] * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS);\ - out[3] = ACTIVATION(convert_char(round(((float)dotProd[o + OUT_BLOCK_WIDTH * 3][b] * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS); - -#endif - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) -// input data is in blocks 4batch x 32 features - -#define NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_mmad_batched_block)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, - __global BIAS_TYPE* biases, - const __global float* quantizations, -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH; - const uint y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT; - - const uint b_f = ((uint)get_group_id(2) * WG_BATCH_COUNT + get_sub_group_id()); - -#if WEIGHTS_PER_WORKITEM == 4 - const uint f = (b_f * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED; -#else - const uint f = ((b_f * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED; -#endif - const uint b_block = (b_f * 8 * WEIGHTS_PER_WORKITEM) / 
FILTER_OFM_ALIGNED; - - // all accumulators - int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = ((b_f * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block + input_y * IN_Y_PITCH + input_x * IN_X_PITCH; - - uint filter_idx = filter_offset; - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - uint input_offset_y = 0; - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (uint j = 0; j < FILTER_SIZE_Y; ++j) - { - uint input_idx = input_offset + input_offset_y; - - ////// preloading input data ////// - int4 preloaded_input[NEEDED_INPUT_X]; - for(int p = 0; p < NEEDED_INPUT_X; p++) - { - preloaded_input[p] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); - input_idx += IN_X_PITCH; - } - - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint wi = 0; wi < WEIGHTS_PER_WORKITEM; wi++) - { - ////// preloading weights data ////// - int8 preloaded_weights[FILTER_SIZE_X]; - uint tmp_filter_idx = filter_idx; - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for(uint w = 0; w < FILTER_SIZE_X; w++) - { - preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + tmp_filter_idx + (wi * FILTER_OFM_BLOCK_PITCH)))); - tmp_filter_idx += FILTER_X_PITCH; - } - ////// computing ////// - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for (uint i = 0; i < FILTER_SIZE_X; ++i) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++) - { - const uint out_idx = ox + wi * OUT_BLOCK_WIDTH; - const uint in_idx = ox * STRIDE_SIZE_X + i; - dotProd[out_idx] = MMAD_4x8(preloaded_input[in_idx], preloaded_weights[i], dotProd[out_idx]); - } - } - } - filter_idx += FILTER_X_PITCH * FILTER_SIZE_X; - input_offset_y += IN_Y_PITCH; - } - input_offset += IN_F_BLOCK_PITCH; - } - -////// QUANTIZE & OUTPUT ////// - -#if WEIGHTS_PER_WORKITEM == 4 - -float4 quant_f = vload4(0, quantizations + f); -float4 bias_f = vload4(0, biases + f); -float4 calib_f = vload4(0, calibrations + f); - -uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y, x); - -__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) -for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) -{ - uint4 to_output; - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - QUANTIZATION; - to_output[b] = as_uint(out); - } - intel_sub_group_block_write4((__global uint*)(output + dst_index), to_output); - dst_index += OUT_X_PITCH; -} -#else -__attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) -for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) -{ - float quant_f = quantizations[f + w * 8]; - float bias_f = biases[f + w * 8]; -#if CALIBRATION_TERM - float calib_f = calibrations[f + w * 8]; -#endif - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * w; - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - #if CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f); - #else // CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF); - #endif // CALIBRATION_TERM - - 
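The fs_bs_yx_bsv4_fsv32 indexing used by these writes is the same layout that calculate_output_offset_to_account_padding decodes in the SLM kernels above: features are blocked by 32 and batches by 4, so a linear offset factors into (f_val, b_val, x, y, b_slice, f_slice) by successive mod/div before re-linearizing with the padded pitches. A small C sketch of that decode, with parameters standing in for the OUTPUT_* macros (batch assumed >= 4, features >= 32):

#include <stdint.h>

typedef struct { uint32_t f_val, b_val, x, y, b_slice, f_slice; } Coords;

/* Mirror of the mod/div chain in
 * calculate_output_offset_to_account_padding. */
Coords decode_bsv4_fsv32(uint32_t off, uint32_t size_x, uint32_t size_y,
                         uint32_t batch, uint32_t features)
{
    Coords c;
    c.f_val   = off % 32;      off /= 32;      /* feature within 32-block */
    c.b_val   = off % 4;       off /= 4;       /* batch within 4-block    */
    c.x       = off % size_x;  off /= size_x;
    c.y       = off % size_y;  off /= size_y;
    c.b_slice = off % (batch / 4);  off /= (batch / 4);
    c.f_slice = off % (features / 32);
    return c;
}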
const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4 + b, f + w * 8, y, x + o); - output[dst_index] = ACTIVATION(convert_char(dotProd[out_idx][b]), ACTIVATION_PARAMS); - } - } -} -#endif - -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl deleted file mode 100644 index 98b034bb714..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_batched_block_1x1.cl +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s0);\ - out[1] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * SCALE + bias_f.s1);\ - out[2] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * SCALE + bias_f.s2);\ - out[3] = convert_uchar_sat((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * SCALE + bias_f.s3); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - uchar4 out;\ - out[0] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b]);\ - out[1] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b]);\ - out[2] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b]);\ - out[3] = convert_uchar_sat(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b]); - -#else - -#define QUANTIZATION \ - char4 out;\ - out[0] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 0][b] * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0 ) ), ACTIVATION_PARAMS);\ - out[1] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 1][b] * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1 ) ), ACTIVATION_PARAMS);\ - out[2] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 2][b] * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2 ) ), ACTIVATION_PARAMS);\ - out[3] = ACTIVATION(convert_char(round( ((float)dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * 3][b] * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3 ) ), ACTIVATION_PARAMS); - -#endif - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) -// input data is in blocks 4batch x 32 features - -#define 
NEEDED_INPUT_X ((OUT_BLOCK_WIDTH-1) * (STRIDE_SIZE_X) + (FILTER_SIZE_X - 1) + 1) -#define NEEDED_INPUT_Y ((OUT_BLOCK_HEIGHT-1) * (STRIDE_SIZE_Y) + (FILTER_SIZE_Y - 1) + 1) - -__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) -KERNEL(convolution_mmad_batched_block_1x1)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, - __global BIAS_TYPE* biases, - const __global float* quantizations, -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx) -{ - const uint x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH; - const uint y = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT; - - const uint b_f = ((uint)get_group_id(2) * WG_BATCH_COUNT + (uint)get_sub_group_id()); -#if WEIGHTS_PER_WORKITEM == 4 - const uint f = (b_f * 32 + get_sub_group_local_id() * 4) % FILTER_OFM_ALIGNED; -#else - const uint f = ((b_f * WEIGHTS_PER_WORKITEM * 8) + get_sub_group_local_id() ) % FILTER_OFM_ALIGNED; -#endif - const uint b_block = (b_f * 8 * WEIGHTS_PER_WORKITEM) / FILTER_OFM_ALIGNED; - - int4 dotProd[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * WEIGHTS_PER_WORKITEM] = { 0 }; - - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - - const uint filter_offset = ((b_f * WEIGHTS_PER_WORKITEM) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint input_offset = IN_OFFSET + IN_B_BLOCK_PITCH * b_block; - - uint filter_idx = filter_offset; - for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k) - { - ////// preloading input data ////// - int4 preloaded_input[NEEDED_INPUT_X * NEEDED_INPUT_Y]; - for(int h = 0; h < NEEDED_INPUT_Y; h++) - { - for(int p = 0; p < NEEDED_INPUT_X; p++) - { - const int input_offset_y = input_y + h; - const int input_offset_x = input_x + p; - - uint input_idx = input_offset + input_offset_y * IN_Y_PITCH + input_offset_x * IN_X_PITCH; - preloaded_input[p + h * NEEDED_INPUT_X] = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx))); - } - } - - __attribute__((opencl_unroll_hint(FILTER_SIZE_Y))) - for (uint j = 0; j < FILTER_SIZE_Y; ++j) - { - __attribute__((opencl_unroll_hint(FILTER_SIZE_X))) - for (uint i = 0; i < FILTER_SIZE_X; ++i) - { - ////// preloading weights data ////// - int8 preloaded_weights[WEIGHTS_PER_WORKITEM]; - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - preloaded_weights[w] = as_int8(intel_sub_group_block_read8((const __global uint*) (weights + (filter_idx + w * FILTER_OFM_BLOCK_PITCH) ) )); - } - - ////// computing ////// - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint oy = 0; oy < OUT_BLOCK_HEIGHT; oy++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint ox = 0; ox < OUT_BLOCK_WIDTH; ox++) - { - const uint out_idx = ox + OUT_BLOCK_WIDTH * (oy + w * OUT_BLOCK_HEIGHT); - const uint preloaded_idx =ox * STRIDE_SIZE_X + i + NEEDED_INPUT_X * (oy * STRIDE_SIZE_Y + j); - dotProd[out_idx] = MMAD_4x8(preloaded_input[preloaded_idx], preloaded_weights[w], dotProd[out_idx]); - } - } - } - filter_idx += FILTER_X_PITCH; - } - } - input_offset += IN_F_BLOCK_PITCH; - } - - -#if WEIGHTS_PER_WORKITEM == 4 - -float4 quant_f = vload4(0, quantizations + f); -float4 bias_f = vload4(0, biases + f); -float4 calib_f = vload4(0, calibrations + f); - -uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y, x); - 
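The NEEDED_INPUT_X/Y macros above size the preloaded register windows: OUT_BLOCK strided outputs under a FILTER-wide window touch ((OUT_BLOCK-1)*STRIDE + (FILTER-1) + 1) input positions, since the last output's last tap lands at (OUT_BLOCK-1)*STRIDE + (FILTER-1). A trivial C check of the formula, with illustrative values:

#include <assert.h>

/* Footprint of OUT_BLOCK strided outputs under a FILTER-wide window;
 * matches the NEEDED_INPUT_X/Y macros from the deleted kernels. */
int needed_input(int out_block, int stride, int filter)
{
    return (out_block - 1) * stride + (filter - 1) + 1;
}

int main(void)
{
    assert(needed_input(14, 1, 3) == 16); /* 14 outputs, stride 1, 3-tap */
    assert(needed_input(2, 2, 3) == 5);   /*  2 outputs, stride 2, 3-tap */
    return 0;
}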
-__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) -for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) -{ - uint tmp_dst_index = dst_index; - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - uint4 to_output; - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * h; - - QUANTIZATION; - to_output[b] = as_uint(out); - } - intel_sub_group_block_write4((__global uint*)(output + tmp_dst_index), to_output); - tmp_dst_index += OUT_X_PITCH; - } - dst_index += OUT_Y_PITCH; -} - -#else // WEIGHTS_PER_WORKITEM ==4 - -////// QUANTIZE & OUTPUT ////// -__attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) -for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) -{ - float quant_f = quantizations[f + w * 8]; - float bias_f = biases[f + w * 8]; -#if CALIBRATION_TERM - float calib_f = calibrations[f + w * 8]; -#endif - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT); - for(uint b = 0; b < 4; b++) - { - #if CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * calib_f); - #else // CALIBRATION_TERM - dotProd[out_idx][b] = (UNIT_TYPE)round(((float)dotProd[out_idx][b] * quant_f * I_QF + bias_f) * O_QF); - #endif // CALIBRATION_TERM - } - } - } -} - -////// OUTPUT STAGE ////// -__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) -for(uint h = 0; h < OUT_BLOCK_HEIGHT; h++) -{ - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for(uint o = 0; o < OUT_BLOCK_WIDTH; o++) - { - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f, y + h, x + o); - - __attribute__((opencl_unroll_hint(4))) - for(uint b = 0; b < 4; b++) - { - #if WEIGHTS_PER_WORKITEM == 2 - char2 out; - const uint out_idx = o + OUT_BLOCK_WIDTH * h; - out[0] = ACTIVATION(convert_char(dotProd[out_idx][b]), ACTIVATION_PARAMS); - out[1] = ACTIVATION(convert_char(dotProd[out_idx + OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT][b]), ACTIVATION_PARAMS); - - intel_sub_group_block_write_uc2((__global uchar*)(output + dst_index + b * 32), as_uchar2(out)); - #else - __attribute__((opencl_unroll_hint(WEIGHTS_PER_WORKITEM))) - for(uint w = 0; w < WEIGHTS_PER_WORKITEM; w++) - { - const uint out_idx = o + OUT_BLOCK_WIDTH * (h + w * OUT_BLOCK_HEIGHT); - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4, f + w * 8, y + h, x + o); - char char_val = ACTIVATION(convert_char(dotProd[out_idx][b]), ACTIVATION_PARAMS); - output[dst_index + b * 32] = char_val; - } - #endif - } - } -} - -#endif // WEIGHTS_PER_WORKITEM ==4 - -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED - - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl index 9f212434dc5..a26ca07ce42 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv32.cl @@ -33,7 +33,6 @@ #undef TO_ACCUMULATOR_TYPE #endif -#if 
QUANTIZATION_TERM #define ACCUMULATOR_TYPE int #define TO_ACCUMULATOR_TYPE(x) convert_int(x) #define ACTIVATION_TYPE float @@ -65,10 +64,6 @@ #error "convolution_gpu_mmad_bfyx_b_fs_yx_fsv32: Unsupported block size" #endif -#else // QUANTIZATION_TERM -#error "convolution_gpu_mmad_bfyx_b_fs_yx_fsv32: invalid parameters: quantization term is expected to be true" -#endif - #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) #define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl index 18d2e9b8859..cfa4f7ceaae 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4.cl @@ -30,7 +30,6 @@ #undef TO_ACCUMULATOR_TYPE #endif -#if QUANTIZATION_TERM #define ACCUMULATOR_TYPE int #define TO_ACCUMULATOR_TYPE(x) convert_int(x) #define ACTIVATION_TYPE float @@ -54,10 +53,6 @@ #error "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4: Unsupported block size" #endif -#else // QUANTIZATION_TERM -#error "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4: invalid parameters: quantization term is expected to be true" -#endif - #define AS_TYPE_N_(type, n, x) as_##type##n(x) #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) #define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl deleted file mode 100644 index 9d84fd8bbc0..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_2x14_rep4.cl +++ /dev/null @@ -1,945 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
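Both hunks above leave the integer accumulator path unconditional: int8 activations and weights accumulate into an int32, and only the final scaling runs in float. A self-contained C sketch of that pattern; the names are illustrative and not taken from these kernels:

    #include <stdint.h>

    /* int8 x int8 dot product with int32 accumulation and a float rescale at
       the end - the pattern behind ACCUMULATOR_TYPE int / ACTIVATION_TYPE float. */
    static float dot_int8(const int8_t *act, const int8_t *wei, int n, float scale)
    {
        int32_t acc = 0;                              /* ACCUMULATOR_TYPE */
        for (int i = 0; i < n; ++i)
            acc += (int32_t)act[i] * (int32_t)wei[i];
        return (float)acc * scale;                    /* ACTIVATION_TYPE */
    }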
- -#include "include/data_types.cl" -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - slm_write0.s0 = convert_uchar_sat((float)outvec.s0 * SCALE + bias_f);\ - slm_write0.s1 = convert_uchar_sat((float)outvec.s1 * SCALE + bias_f);\ - slm_write0.s2 = convert_uchar_sat((float)outvec.s2 * SCALE + bias_f);\ - slm_write0.s3 = convert_uchar_sat((float)outvec.s3 * SCALE + bias_f); - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - slm_write0.s0 = convert_uchar_sat(outvec.s0);\ - slm_write0.s1 = convert_uchar_sat(outvec.s1);\ - slm_write0.s2 = convert_uchar_sat(outvec.s2);\ - slm_write0.s3 = convert_uchar_sat(outvec.s3); - -#else - -#define QUANTIZATION \ - slm_write0.s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s0) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS));\ - slm_write0.s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s1) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS));\ - slm_write0.s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s2) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS));\ - slm_write0.s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec.s3) * quant_f * I_QF + bias_f) * calib_f)), ACTIVATION_PARAMS)); - -#endif - -// mapping to clDNN -#define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C) -#define _OD OUTPUT_FEATURE_NUM -#define _OW OUTPUT_SIZE_X -#define _OH OUTPUT_SIZE_Y -#define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) -#define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) -#define _IH INPUT0_SIZE_Y -#define _IW INPUT0_SIZE_X -#define _ID INPUT0_FEATURE_NUM -#define K_HEIGHT FILTER_SIZE_Y -#define K_WIDTH FILTER_SIZE_X -#define BATCH_SIZE OUTPUT_BATCH_NUM - -#define IHPAD (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) -#define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) -#define K_STRIDE STRIDE_SIZE_X -// end of mapping - -// for now kernel stride is square -#define K_WSTRIDE K_STRIDE -#define K_HSTRIDE K_STRIDE - -#define PACK 32 -#define BATCH_PACK 4 - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution_mmad_slm_2x14_rep4)( -__global int8 *inputs, -__global uchar* outputs, -__global int8* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - const __global float* quantizations, -#endif -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx -) -{ - const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z; - const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y; - - ushort fmg = get_group_id(0); // Output Depth - ushort group_y = get_group_id(1); // Output Width - ushort group_z = get_group_id(2); // Output Height - - /* 32,1,4 WG , SIMD8 - 16 HW threads in a WG - threads 0-3 (group1) : (lid_x:0-15,lid_y:0,lid_z:0) - threads 4-7 (group2) : (lid_x:0-15,lid_y:0,lid_z:1) - threads 8-11 (group3) : (lid_x:0-15,lid_y:0,lid_z:2) - threads 12-15 (group4) : (lid_x:0-15,lid_y:0,lid_z:3) - - Verify the sub_group layout with the printfs below: - - if(group_z == 0 && group_y == 0 && fmg == 0 && get_sub_group_id() == 31) { - printf("\n sub_group_local_id: %d, lid_x: %d, lid_y: %d, lid_z: %d ", get_sub_group_local_id(), get_local_id(0) ,get_local_id(1),get_local_id(2)); - printf("\n #WorkgroupsX: %d, #WorkgroupsY: %d, #WorkgroupsZ: %d",get_num_groups(0),get_num_groups(1),get_num_groups(2)); - } - - If the sub_group layout is different, derive lid_x and lid_z accordingly - - lid_z: thread_id/4 - */ - - /* Thread, local IDs */ - ushort thread_id = 
get_sub_group_id(); - ushort threadid_group_4 = thread_id % 4; - ushort threadid_mod_2 = thread_id % 2; - ushort threadid_mod_8 = thread_id % 8; - - ushort lid_x = get_local_id(0); - ushort lid_z = get_local_id(2); - - uchar lane_id = get_sub_group_local_id(); - - /* 32-bit signed accumulator for 4 mini-batches; each thread uses OUT_BLOCK_WIDTH*HEIGHT*4 registers - Will be converted to 8 bits before the final write */ - - int4 out[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = { 0 }; - - /* Account for batching */ - - ushort batch = ( fmg*LOCAL_SIZE_X ) / _OD; - - // Size calculated for int8 elements; one batch is processed as [H][W][4N][32C] - uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ; - - uint in_addr_offset = batch*input_size; - - /* Goto activation tile for work group, offset is w.r.t int8 array */ - - uint groupy_tile = TILE_W*group_y; - uint groupz_tile = TILE_H*group_z; - - in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK; - - /* SLM space for Activation, Weights - ( 32,1,4 ) Workgroup - 4 tiles along Y direction and 32 different output channels - Activation - 10Hx16Wx4Nx32C Weights - 9RSx32Kx32C */ - - __local int8 act_slm [ 10*16*4 ]; - __local int8 weight_slm [ 9*32 ]; - - /* 10Hx16Wx4Nx32C activation tile written into SLM. Distribute among 16 threads in Workgroup - threads 0-1 write 16x4x32 of H=0, W=0...15 ( 8x4x32 per thread ) - threads 2-3 write 16x4x32 of H=1, W=0...15 ( 8x4x32 per thread ) - threads 4-5 write 16x4x32 of H=2, W=0...15 ( 8x4x32 per thread ) - threads 6-7 write 16x4x32 of H=3, W=0...15 ( 8x4x32 per thread ) - threads 8-9 write 16x4x32 of H=4, W=0...15 ( 8x4x32 per thread ) - threads 10-11 write 16x4x32 of H=5, W=0...15 ( 8x4x32 per thread ) - thread 12 writes 16x4x32 of H=6, W=0...15 ( 16x4x32 per thread ) - thread 13 writes 16x4x32 of H=7 - thread 14 writes 16x4x32 of H=8 - thread 15 writes 16x4x32 of H=9 - - Interleaved write to avoid SLM bank conflicts - - threads 0,1 write 16x4x32 together - thread0 writes the first 4x32 block, thread1 writes the next 4x32 block, etc. 
- */ - - - /* Goto activation tile for thread in group */ - - uint row_offset = thread_id / 2; - - if ( thread_id >= 12 ) { - row_offset = 6 + thread_id - 12 - threadid_mod_2; - } - - // In addr offset for the particular thread - in_addr_offset += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ; - - /* Activation SLM indices */ - uint act_slm_write = row_offset * ( TILE_W + 2) * BATCH_PACK; - uint act_slm_read = OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ; - - /* Weights - Weight Global Tensor Order: [K/8][C/32][R][S][8C][8K][4C] - */ - - /* 9RSx32Kx32C Weight Block in SLM - thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0 ( k=0..7) - thread1 handles w(0,0),w(0,1),w(0,2) of K=1 ( k=8..15) - thread2 handles w(1,0),w(1,1) of K=0 ( k=0..7) - thread3 handles w(1,0),w(1,1) of K=1 ( k=8..15) - thread4 handles w(1,2),w(2,0) of K=0 ( k=0..7) - thread5 handles w(1,2),w(2,0) of K=1 ( k=8..15) - thread6 handles w(2,1),w(2,2) of K=0 ( k=0..7) - thread7 handles w(2,1),w(2,2) of K=1 ( k=8..15) - - Similarly threads8-15 handles for K=2,3 - - Weight Layout in SLM - - w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=8..15,C=0..15) - w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=8..15,C=16..31) - - Above interleaving present to avoid SLM Bank conflicts when fused threads read from SLM - Thread0 will read k=0..7, thread1 will read k=8..15 - - First all output channels are present in SLM, then next weight pixel is present in SLM */ - - #define NUM_FILTERS (K_HEIGHT * K_WIDTH) - - uint output_depth = fmg % ( _OD / LOCAL_SIZE_X ); - - uint weight_size_CRS = ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside - - // Global weight addr for workgroup - uint weight_global_addr_offset = output_depth * 4 * weight_size_CRS ; //32 output channels per workgroup - - // Global weight address for thread - uint weight_global_channel_offset = threadid_mod_2 * weight_size_CRS ; - - uint slm_channel_offset = 0; - - if ( thread_id >= 8 ) { - weight_global_channel_offset += 2*weight_size_CRS; - slm_channel_offset = 1; - } - - uint weight_global_pixel_offset = 0; - uint slm_pixel_offset = 0; - - if ( threadid_mod_8 >=2 ) - { - weight_global_pixel_offset = 3*8 + ( ( (threadid_mod_8/2) - 1 )*2*8 ); - slm_pixel_offset = 3*LOCAL_SIZE_X + ( ( (threadid_mod_8/2) - 1 )*2*LOCAL_SIZE_X ); - } - - weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset; - - /* Weight slm write index */ - - uint slm_write_weight = threadid_mod_2*4 + slm_pixel_offset + slm_channel_offset * 16; - - /* Weight slm read index */ - - uint wt_slm_rd_offset = threadid_group_4*8; - - if ( threadid_mod_2 ) - { - wt_slm_rd_offset = wt_slm_rd_offset - 8 + 4; - } - - int kd; - - __attribute__((opencl_unroll_hint(1))) - for(kd = 0; kd < ( _ID / PACK ) ; kd++) - { - - { - /* Load Activation from global to SLM */ - - int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset; - - __global uint *activation_tile = (__global uint*)&inputs[ in_addr ]; - - __local uint *act_slm_ptr = (__local uint *) &act_slm [ act_slm_write ]; - - /* The odd thread in fused pair will start from next 4x8 block */ - - activation_tile += threadid_mod_2*4*8; - act_slm_ptr += threadid_mod_2*4*8; - - int4 act_col_0 = as_int4( intel_sub_group_block_read4(activation_tile) ); - int4 act_col_1 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); - int4 act_col_2 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); - int4 act_col_3 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); 
- int4 act_col_4 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - int4 act_col_5 = as_int4( intel_sub_group_block_read4(activation_tile + 5*8*8) ); - int4 act_col_6 = as_int4( intel_sub_group_block_read4(activation_tile + 6*8*8) ); - int4 act_col_7 = as_int4( intel_sub_group_block_read4(activation_tile + 7*8*8) ); - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 5*8*8 ) , as_uint4 ( act_col_5 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 6*8*8 ) , as_uint4 ( act_col_6 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 7*8*8 ) , as_uint4 ( act_col_7 ) ); - - if ( thread_id >=12 ) - { - activation_tile = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8; - act_slm_ptr += 8*8*8; - - int4 act_col_9 = as_int4( intel_sub_group_block_read4(activation_tile) ); - int4 act_col_10 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); - int4 act_col_11 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); - int4 act_col_12 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); - int4 act_col_13 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - int4 act_col_14 = as_int4( intel_sub_group_block_read4(activation_tile + 5*8*8) ); - int4 act_col_15 = as_int4( intel_sub_group_block_read4(activation_tile + 6*8*8) ); - int4 act_col_16 = as_int4( intel_sub_group_block_read4(activation_tile + 7*8*8) ); - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_9 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_10 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 5*8*8 ) , as_uint4 ( act_col_14 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 6*8*8 ) , as_uint4 ( act_col_15 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 7*8*8 ) , as_uint4 ( act_col_16 ) ); - } - - /* load weights from global to weight_slm */ - - int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset; - - __global uint *weight_tile = (__global uint*)&weights [ weight_addr ]; - __local uint *wt_slm_ptr = (__local uint *) &weight_slm [ slm_write_weight ]; - - int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); - int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); - int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); - int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) ); - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w0 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 32*8 ) , as_uint4 ( w2 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 32*8 + 8*8 ) , as_uint4 ( w3 ) ); - - if( threadid_mod_8 < 2 ) - { - weight_tile += 16*8; - wt_slm_ptr += 2*32*8; - - int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); - int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w5 ) ); - } - } - - // Synchronize SLM writes across workgroup - 
barrier(CLK_LOCAL_MEM_FENCE); - - uint wt_slm_rd = wt_slm_rd_offset; - - __local uint *slm_ptr0 = (__local uint *) &act_slm[ act_slm_read ]; - __local uint *slm_ptr1 = (__local uint *) &weight_slm[ wt_slm_rd ]; - - int8 weights_reg0, weights_reg1,weights_reg2; - - /********************************************************************************************************** - First phase - load first row of weights and for the first activation row - 1Hx8Wx4N inputs at a time - - Weights - 24 registers, Activations - 32 registers: Total 56 registers used for input data - ***********************************************************************************************************/ - { - int4 act_reg[ 8 ]; - - /* Load weights from SLM into registers */ - { - weights_reg0.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg0.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg1.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg1.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg2.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg2.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - } - - /* load first 1Hx8Wx4N inputs - Activation Broadcast will occur since it is same for fused threads */ - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 0; ic < 8; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = ic * BATCH_PACK * 8 ; - - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/ - - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[0], weights_reg0 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[1], weights_reg0 ); - out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[2], weights_reg0 ); - out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[3], weights_reg0 ); - out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[4], weights_reg0 ); - out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[5], weights_reg0 ); - out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[6], weights_reg0 ); - out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[7], weights_reg0 ); - - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[1], weights_reg1 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[2], weights_reg1 ); - out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[3], weights_reg1 ); - out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[4], weights_reg1 ); - out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[5], weights_reg1 ); - out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[6], weights_reg1 ); - out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[7], weights_reg1 ); - - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg[2], weights_reg2 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg[3], weights_reg2 ); - out[ 2 ] = _MMAD_4x8 ( out[ 2 ], act_reg[4], weights_reg2 ); - out[ 3 ] = _MMAD_4x8 ( out[ 3 ], act_reg[5], weights_reg2 ); - out[ 4 ] = _MMAD_4x8 ( out[ 4 ], act_reg[6], weights_reg2 ); - out[ 5 ] = _MMAD_4x8 ( out[ 5 ], act_reg[7], weights_reg2 ); - - /* load next 1Hx8Wx4N inputs */ - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 8; ic < 16; ic++) - { - uint slm_offset = ic * BATCH_PACK * 8; - - act_reg [ ic - 8 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset) ) ; - } - - /* Convolve */ - - out[ 6 ] = _MMAD_4x8 ( out[ 6 ], act_reg[0], weights_reg2 ); - out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[1], weights_reg2 ); - out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[2], weights_reg2 ); - out[ 9 ] = 
_MMAD_4x8 ( out[ 9 ], act_reg[3], weights_reg2 ); - out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[4], weights_reg2 ); - out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[5], weights_reg2 ); - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[6], weights_reg2 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[7], weights_reg2 ); - - out[ 7 ] = _MMAD_4x8 ( out[ 7 ], act_reg[0], weights_reg1 ); - out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[1], weights_reg1 ); - out[ 9 ] = _MMAD_4x8 ( out[ 9 ], act_reg[2], weights_reg1 ); - out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[3], weights_reg1 ); - out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[4], weights_reg1 ); - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[5], weights_reg1 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[6], weights_reg1 ); - - out[ 8 ] = _MMAD_4x8 ( out[ 8 ], act_reg[0], weights_reg0 ); - out[ 9 ] = _MMAD_4x8 ( out [ 9 ], act_reg[1], weights_reg0 ); - out[ 10 ] = _MMAD_4x8 ( out[ 10 ], act_reg[2], weights_reg0 ); - out[ 11 ] = _MMAD_4x8 ( out[ 11 ], act_reg[3], weights_reg0 ); - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg[4], weights_reg0 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg[5], weights_reg0 ); - } - - /* Second , Third phase */ - { - int8 weights_reg3, weights_reg4,weights_reg5; - int4 act_reg_2[ 6 ]; - - /***************************************************************************************************************************************** - Second phase - load second row of weights, now both rows are in registers, for the second activation row - 1Hx6Wx4N inputs at a time - - Weights - 48 registers, Activations - 24 registers: Total 72 registers used for input data - ******************************************************************************************************************************************/ - - /* Load weights of row = 1 from SLM into registers */ - { - - weights_reg3.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg3.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg4.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg4.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg5.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg5.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - } - - /* load input row =1,col=0:1 1Hx2Wx8N */ - - uint slm_row_offset_2 = 1*(TILE_W + 2)*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_2) ) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_2 + BATCH_PACK*8) ) ; - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[0] , weights_reg0 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[0] , weights_reg3 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg_2[1] , weights_reg3 ); - out[ 15 ] = _MMAD_4x8 ( out[ 15 ], act_reg_2[1] , weights_reg0 ); - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[1], weights_reg1 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[1], weights_reg4 ); - - /* load input row =1,col=2:7,8:13,1Hx6Wx4N */ - - uint col = 2; - - __attribute__((opencl_unroll_hint(2))) - do { - - uint slm_offset = 1*(TILE_W + 2)*BATCH_PACK*8 + col*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + BATCH_PACK*8)) ; - act_reg_2 [ 2 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 2*BATCH_PACK*8)) ; - act_reg_2 [ 3 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 
3*BATCH_PACK*8) ) ; - act_reg_2 [ 4 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 4*BATCH_PACK*8) ) ; - act_reg_2 [ 5 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 5*BATCH_PACK*8) ) ; - - uint first_row_offset = col - 2; - uint second_row_offset = 14 + col - 2; - - out [ first_row_offset ] = _MMAD_4x8 ( out[ first_row_offset ] , act_reg_2[0] , weights_reg5 ); - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1] , act_reg_2[0], weights_reg4 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2] , act_reg_2[0], weights_reg3 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[1], weights_reg3 ); - - out [ second_row_offset ] = _MMAD_4x8 ( out[ second_row_offset ] , act_reg_2[0] , weights_reg2 ); - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1] , act_reg_2[0], weights_reg1 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2] , act_reg_2[0], weights_reg0 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[1], weights_reg0 ); - - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1 ], act_reg_2[1], weights_reg5 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2 ], act_reg_2[1], weights_reg4 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[2], weights_reg4 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4 ], act_reg_2[2], weights_reg3 ); - - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1 ], act_reg_2[1], weights_reg2 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2 ], act_reg_2[1], weights_reg1 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3 ], act_reg_2[2], weights_reg1 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4 ], act_reg_2[2], weights_reg0 ); - - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2], act_reg_2[2], weights_reg5 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3], act_reg_2[3], weights_reg5 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[3], weights_reg4 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[3], weights_reg3 ); - - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2], act_reg_2[2], weights_reg2 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[3], weights_reg2 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[3], weights_reg1 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[3], weights_reg0 ); - - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[4], weights_reg3 ); - out [ first_row_offset + 7 ] = _MMAD_4x8 ( out[ first_row_offset + 7], act_reg_2[5], weights_reg3 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[4], weights_reg4 ); - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[5], weights_reg4 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[4], weights_reg5 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[5], weights_reg5 ); - - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[4], weights_reg0 ); - out [ second_row_offset + 7 ] = 
_MMAD_4x8 ( out[ second_row_offset + 7], act_reg_2[5], weights_reg0 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[4], weights_reg1 ); - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[5], weights_reg1 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[4], weights_reg2 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[5], weights_reg2 ); - - col +=6; - - } while ( col < 14 ); - - /* load input row =1,col=14:15 1Hx2Wx4N */ - - uint slm_row_offset_3 = 1 * (TILE_W + 2) * BATCH_PACK * 8 + 14 * BATCH_PACK * 8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_3)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_3 + BATCH_PACK*8)) ; - - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[0], weights_reg4 ); - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[0], weights_reg1 ); - out[ 26 ] = _MMAD_4x8 ( out[ 26 ], act_reg_2[0], weights_reg2 ); - - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg_2[0], weights_reg5 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[1], weights_reg5 ); - - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[1], weights_reg2 ); - - /**************************************************************************************************************************************** - Third phase - load third row of weights, this replaces first weight row, for the third activation row read 1Hx6Wx4N inputs at a time - - Weights - 48 registers, Activations - 24 registers: Total 72 registers used for input data - *****************************************************************************************************************************************/ - - /* Load weights of row = 2 from SLM into registers - replaces row = 0 weights */ - { - weights_reg0.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg0.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg1.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg1.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - - weights_reg2.s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 ) ); - weights_reg2.s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptr1 + 64 ) ); - slm_ptr1 += LOCAL_SIZE_X*8; - } - - uint slm_row_offset_4 = 2*(TILE_W + 2)*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_4)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_4 + BATCH_PACK*8)) ; - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[0] , weights_reg3 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[0] , weights_reg0 ); - out[ 1 ] = _MMAD_4x8 ( out[ 1 ], act_reg_2[1] , weights_reg0 ); - out[ 15 ] = _MMAD_4x8 ( out[ 15 ], act_reg_2[1] , weights_reg3 ); - - out[ 14 ] = _MMAD_4x8 ( out[ 14 ], act_reg_2[1], weights_reg4 ); - out[ 0 ] = _MMAD_4x8 ( out[ 0 ], act_reg_2[1], weights_reg1 ); - - /* load input row =2,col=2:7,8:13,1Hx6Wx4N */ - - uint col_2 = 2; - - __attribute__((opencl_unroll_hint(2))) - do { - - uint slm_offset = 2*(TILE_W + 2)*BATCH_PACK*8 + col_2*BATCH_PACK*8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + BATCH_PACK*8)) ; - act_reg_2 [ 2 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 2*BATCH_PACK*8)) ; - act_reg_2 [ 3 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 3*BATCH_PACK*8) ) ; - act_reg_2 [ 4 ] = 
as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 4*BATCH_PACK*8) ) ; - act_reg_2 [ 5 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset + 5*BATCH_PACK*8) ) ; - - uint first_row_offset = col_2 - 2; - uint second_row_offset = 14 + col_2 - 2; - - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1] , act_reg_2[0], weights_reg1 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2] , act_reg_2[0], weights_reg0 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[1], weights_reg0 ); - out [ first_row_offset ] = _MMAD_4x8 ( out[ first_row_offset ] , act_reg_2[0] , weights_reg2 ); - - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1] , act_reg_2[0], weights_reg4 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2] , act_reg_2[0], weights_reg3 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[1], weights_reg3 ); - out [ second_row_offset ] = _MMAD_4x8 ( out[ second_row_offset ] , act_reg_2[0] , weights_reg5 ); - - out [ first_row_offset + 1 ] = _MMAD_4x8 ( out[ first_row_offset + 1 ], act_reg_2[1], weights_reg2 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2 ], act_reg_2[1], weights_reg1 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3 ], act_reg_2[2], weights_reg1 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4 ], act_reg_2[2], weights_reg0 ); - - out [ second_row_offset + 1 ] = _MMAD_4x8 ( out[ second_row_offset + 1 ], act_reg_2[1], weights_reg5 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2 ], act_reg_2[1], weights_reg4 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3 ], act_reg_2[2], weights_reg4 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4 ], act_reg_2[2], weights_reg3 ); - - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[3], weights_reg0 ); - out [ first_row_offset + 2 ] = _MMAD_4x8 ( out[ first_row_offset + 2], act_reg_2[2], weights_reg2 ); - out [ first_row_offset + 3 ] = _MMAD_4x8 ( out[ first_row_offset + 3], act_reg_2[3], weights_reg2 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[3], weights_reg1 ); - - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[3], weights_reg3 ); - out [ second_row_offset + 2 ] = _MMAD_4x8 ( out[ second_row_offset + 2], act_reg_2[2], weights_reg5 ); - out [ second_row_offset + 3 ] = _MMAD_4x8 ( out[ second_row_offset + 3], act_reg_2[3], weights_reg5 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[3], weights_reg4 ); - - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[4], weights_reg0 ); - out [ first_row_offset + 7 ] = _MMAD_4x8 ( out[ first_row_offset + 7], act_reg_2[5], weights_reg0 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[4], weights_reg1 ); - out [ first_row_offset + 6 ] = _MMAD_4x8 ( out[ first_row_offset + 6], act_reg_2[5], weights_reg1 ); - out [ first_row_offset + 4 ] = _MMAD_4x8 ( out[ first_row_offset + 4], act_reg_2[4], weights_reg2 ); - out [ first_row_offset + 5 ] = _MMAD_4x8 ( out[ first_row_offset + 5], act_reg_2[5], weights_reg2 ); - - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[4], weights_reg3 ); - out [ second_row_offset + 7 ] = _MMAD_4x8 ( out[ second_row_offset + 
7], act_reg_2[5], weights_reg3 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[4], weights_reg4 ); - out [ second_row_offset + 6 ] = _MMAD_4x8 ( out[ second_row_offset + 6], act_reg_2[5], weights_reg4 ); - out [ second_row_offset + 4 ] = _MMAD_4x8 ( out[ second_row_offset + 4], act_reg_2[4], weights_reg5 ); - out [ second_row_offset + 5 ] = _MMAD_4x8 ( out[ second_row_offset + 5], act_reg_2[5], weights_reg5 ); - - col_2 +=6; - - } while ( col_2 < 14 ); - - /* load input row =2,col=14:15 1Hx2Wx4N */ - - uint slm_row_offset_5 = 2 * (TILE_W + 2) * BATCH_PACK * 8 + 14 * BATCH_PACK * 8; - - act_reg_2 [ 0 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_5)) ; - act_reg_2 [ 1 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_row_offset_5 + BATCH_PACK*8)) ; - - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[0], weights_reg1 ); - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[0], weights_reg4 ); - out[ 26 ] = _MMAD_4x8 ( out[ 26 ], act_reg_2[0], weights_reg5 ); - - out[ 12 ] = _MMAD_4x8 ( out[ 12 ], act_reg_2[0], weights_reg2 ); - out[ 13 ] = _MMAD_4x8 ( out[ 13 ], act_reg_2[1], weights_reg2 ); - - out[ 27 ] = _MMAD_4x8 ( out[ 27 ], act_reg_2[1], weights_reg5 ); - } - - /************************************************************************************************* - Fourth phase - discard middle weight row, for fourth activation row load 1Hx8Wx4N at a time - - Weights - 24 registers, Activations - 32 registers: Total 56 registers used for input data - **************************************************************************************************/ - { - int4 act_reg[ 8 ]; - - /* load first 1Hx8Wx4N inputs */ - - uint slm_row_offset_6 = 3 * (TILE_W + 2) * BATCH_PACK * 8 ; - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 0; ic < 8; ic++) - { - /* Load activations from SLM into registers */ - uint slm_offset = ic * BATCH_PACK * 8 + slm_row_offset_6; - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - uint phase_offset = 14; - - out[ phase_offset + 0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[0], weights_reg0 ); - out[ phase_offset + 1 ] = _MMAD_4x8 ( out[ phase_offset +1 ], act_reg[1], weights_reg0 ); - out[ phase_offset +2 ] = _MMAD_4x8 ( out[ phase_offset +2 ], act_reg[2], weights_reg0 ); - out[ phase_offset +3 ] = _MMAD_4x8 ( out[ phase_offset +3 ], act_reg[3], weights_reg0 ); - out[ phase_offset +4 ] = _MMAD_4x8 ( out[ phase_offset +4 ], act_reg[4], weights_reg0 ); - out[ phase_offset +5 ] = _MMAD_4x8 ( out[ phase_offset +5 ], act_reg[5], weights_reg0 ); - out[ phase_offset +6 ] = _MMAD_4x8 ( out[ phase_offset +6 ], act_reg[6], weights_reg0 ); - out[ phase_offset +7 ] = _MMAD_4x8 ( out[ phase_offset +7 ], act_reg[7], weights_reg0 ); - - out[ phase_offset +0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[1], weights_reg1 ); - out[ phase_offset +1 ] = _MMAD_4x8 ( out[ phase_offset +1 ], act_reg[2], weights_reg1 ); - out[ phase_offset +2 ] = _MMAD_4x8 ( out[ phase_offset +2 ], act_reg[3], weights_reg1 ); - out[ phase_offset +3 ] = _MMAD_4x8 ( out[ phase_offset +3 ], act_reg[4], weights_reg1 ); - out[ phase_offset +4 ] = _MMAD_4x8 ( out[ phase_offset +4 ], act_reg[5], weights_reg1 ); - out[ phase_offset +5 ] = _MMAD_4x8 ( out[ phase_offset +5 ], act_reg[6], weights_reg1 ); - out[ phase_offset +6 ] = _MMAD_4x8 ( out[ phase_offset +6 ], act_reg[7], weights_reg1 ); - - out[ phase_offset +0 ] = _MMAD_4x8 ( out[ phase_offset +0 ], act_reg[2], weights_reg2 ); - out[ phase_offset +1 ] = _MMAD_4x8 ( 
out[ phase_offset + 1 ], act_reg[3], weights_reg2 ); - out[ phase_offset + 2 ] = _MMAD_4x8 ( out[ phase_offset + 2 ], act_reg[4], weights_reg2 ); - out[ phase_offset + 3 ] = _MMAD_4x8 ( out[ phase_offset + 3 ], act_reg[5], weights_reg2 ); - out[ phase_offset + 4 ] = _MMAD_4x8 ( out[ phase_offset + 4 ], act_reg[6], weights_reg2 ); - out[ phase_offset + 5 ] = _MMAD_4x8 ( out[ phase_offset + 5 ], act_reg[7], weights_reg2 ); - - /* load next 1Hx8Wx4N inputs */ - - __attribute__((opencl_unroll_hint(8))) - for (int ic = 8; ic < 16; ic++) - { - uint slm_offset = ic * BATCH_PACK * 8 + slm_row_offset_6; - act_reg [ ic - 8 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - out[ phase_offset + 6 ] = _MMAD_4x8 ( out[ phase_offset + 6 ], act_reg[0], weights_reg2 ); - out[ phase_offset + 7 ] = _MMAD_4x8 ( out[ phase_offset + 7 ], act_reg[1], weights_reg2 ); - out[ phase_offset + 8 ] = _MMAD_4x8 ( out[ phase_offset + 8 ], act_reg[2], weights_reg2 ); - out[ phase_offset + 9 ] = _MMAD_4x8 ( out[ phase_offset + 9 ], act_reg[3], weights_reg2 ); - out[ phase_offset + 10 ] = _MMAD_4x8 ( out[ phase_offset + 10 ], act_reg[4], weights_reg2 ); - out[ phase_offset + 11 ] = _MMAD_4x8 ( out[ phase_offset + 11 ], act_reg[5], weights_reg2 ); - out[ phase_offset + 12 ] = _MMAD_4x8 ( out[ phase_offset + 12 ], act_reg[6], weights_reg2 ); - out[ phase_offset + 13 ] = _MMAD_4x8 ( out[ phase_offset + 13 ], act_reg[7], weights_reg2 ); - - out[ phase_offset + 7 ] = _MMAD_4x8 ( out[ phase_offset + 7 ], act_reg[0], weights_reg1 ); - out[ phase_offset + 8 ] = _MMAD_4x8 ( out[ phase_offset + 8 ], act_reg[1], weights_reg1 ); - out[ phase_offset + 9 ] = _MMAD_4x8 ( out[ phase_offset + 9 ], act_reg[2], weights_reg1 ); - out[ phase_offset + 10 ] = _MMAD_4x8 ( out[ phase_offset + 10 ], act_reg[3], weights_reg1 ); - out[ phase_offset + 11 ] = _MMAD_4x8 ( out[ phase_offset + 11 ], act_reg[4], weights_reg1 ); - out[ phase_offset + 12 ] = _MMAD_4x8 ( out[ phase_offset + 12 ], act_reg[5], weights_reg1 ); - out[ phase_offset + 13 ] = _MMAD_4x8 ( out[ phase_offset + 13 ], act_reg[6], weights_reg1 ); - - out[ phase_offset + 8 ] = _MMAD_4x8 ( out[ phase_offset + 8 ], act_reg[0], weights_reg0 ); - out[ phase_offset + 9 ] = _MMAD_4x8 ( out[ phase_offset + 9 ], act_reg[1], weights_reg0 ); - out[ phase_offset + 10 ] = _MMAD_4x8 ( out[ phase_offset + 10 ], act_reg[2], weights_reg0 ); - out[ phase_offset + 11 ] = _MMAD_4x8 ( out[ phase_offset + 11 ], act_reg[3], weights_reg0 ); - out[ phase_offset + 12 ] = _MMAD_4x8 ( out[ phase_offset + 12 ], act_reg[4], weights_reg0 ); - out[ phase_offset + 13 ] = _MMAD_4x8 ( out[ phase_offset + 13 ], act_reg[5], weights_reg0 ); - } - - // To make sure all threads in the WG have finished computing before the next depth tile of activations and weights is loaded into SLM - barrier(CLK_LOCAL_MEM_FENCE); - - } //for kd - - /**************************************************************************************************************** - *******************************Output Write Stage**************************************************************** - ****************************************************************************************************************/ - - /* - Outputs will be passed through the activation function and quantized to 8 bits before writing - Output will be in the same format as the input [K/32][N/4][P][Q][4N][32K] - Writes are staged in SLM so that 32-bit writes can be done to Global memory - */ - - /******************* Write output to SLM *************************************/ - 
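The [K/32][N/4][P][Q][4N][32K] layout named above is the same blocking that the address arithmetic further below implements with row_size_bytes and slice_pack_size_bytes. As a worked example, a C sketch of the byte offset of one (k, n, p, q) element under that blocking; the helper and parameter names are illustrative, with N, P, Q taken as the padded batch/height/width extents:

    #include <stddef.h>

    /* Byte offset of element (k, n, p, q) in a [K/32][N/4][P][Q][4N][32K] int8 tensor. */
    static size_t blocked_offset(size_t k, size_t n, size_t p, size_t q,
                                 size_t N, size_t P, size_t Q)
    {
        size_t slice = k / 32, kk = k % 32;   /* 32 output channels per slice pack */
        size_t npack = n / 4,  nn = n % 4;    /* 4 batches packed together         */
        return (((slice * (N / 4) + npack) * P + p) * Q + q) * (4 * 32) + nn * 32 + kk;
    }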
- /* - Quantize and pack 4x1 byte from consecutive n-coordinates - Write uint32 from each lane to SLM; the entire thread will write 8 consecutive K-coordinates - Four threads will write 4x8xuint32 for 32 output channels and 4 batches - This will be repeated for the entire WG-tile - - Assume one SLM row as 32 uints ( 32 channels, four batches for each channel - 4NK ) - */ - - uint out_slm_write = lid_z * TILE_W * OUT_BLOCK_HEIGHT * 32 + threadid_group_4 * 8 + lane_id; - - __local uchar4* out_slm = (__local uchar4*) &act_slm; - __local uchar4* out_slm_2 = (__local uchar4*) &out_slm[ out_slm_write ]; - - /* Scale the accumulator down and do the ReLU before converting to 8 bits */ - - /* Real code might do this, but need to get the scale right or the convert to uchar saturates and then doesn't match CPU - float scale = (float)SCALE_FACTOR; - - uchar outchar = (uchar)max(((float)outint) * scale, 0.0f); */ - - const uint _feature = ((fmg * 32) % _OD) + (uint)get_local_id(0); - float quant_f = as_float(intel_sub_group_block_read((__global uint*) (quantizations + _feature) )); - float bias_f = as_float(intel_sub_group_block_read((__global uint*) (biases + _feature) )); - float calib_f = as_float(intel_sub_group_block_read((__global uint*) (calibrations + _feature) )); - - __attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT))) - for (int r = 0; r < OUT_BLOCK_HEIGHT; r++) - { - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for (int c = 0; c < OUT_BLOCK_WIDTH; c++) - { - int4 outvec = out[ r * OUT_BLOCK_WIDTH + c]; - - uchar4 slm_write0; - - int slm_addr = c * 32 + r * TILE_W * 32; - - /*TODO - Activation & Quantization code goes here - presently applying ReLU and taking lower 8-bits */ - - QUANTIZATION; - - out_slm_2[ slm_addr ] = slm_write0; - - } // out_block_width-for loop - - } // out_block_height-for loop - - // Wait till all threads in WG finish placing the output - barrier(CLK_LOCAL_MEM_FENCE); - - /******************* Read from SLM & Write to Global *************************************/ - - /* Each lane will read uint4 from SLM - 4K x 4N values. Swizzle them into 4N x 4K order - - SLM Read Distribution - 8Px14Qx4Nx32K output tile - - Threads 0-1 handle row 0, col 0-13, - Threads 2-3 handle row 1, col 0-13, - .. - Threads 14-15 handle row 7, col 0-13 */
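Each SLM read below yields a uint4 in 4K4N order: every uint holds the four batch bytes of one output channel. The global write wants the transpose, 4N4K, where every uint holds four channel bytes of one batch. A standalone C sketch of that byte transpose; the function name is illustrative:

    #include <stdint.h>

    /* in[k] holds bytes n0..n3 of channel k (4K4N);
       out[n] packs bytes k0..k3 of batch n (4N4K). */
    static void swizzle_4k4n_to_4n4k(const uint32_t in[4], uint32_t out[4])
    {
        for (int n = 0; n < 4; ++n) {
            uint32_t v = 0;
            for (int k = 0; k < 4; ++k)
                v |= ((in[k] >> (8 * n)) & 0xFFu) << (8 * k);
            out[n] = v;
        }
    }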
- - uint row_id = thread_id / 2; - uint col_id = ( thread_id % 2 )*7; - - uint out_slm_read = col_id * 32 + row_id * TILE_W * 32 + lane_id * 4; - - __local uint4 *out_slm3 = (__local uint4*) &out_slm[ out_slm_read ]; - - /* feature maps are an array of slicePacks; each H,W position within the slice pack contains 32 8-bit feature maps (channels) of 4 different batches */ - uint row_size_bytes = (_OW + OWPAD) * PACK * BATCH_PACK; - - /* slice_pack is a pack of 32 feature map tiles laid out as [OH][OW][4][32], stored within the full [K/32][N/4][OH][OW][4][32] output */ - uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD); - - /* Each fmg writes [OH][OW][4][32] */ - - uint output_depth_index = output_depth; - - uint batch_index = batch; - - uint slice_pack_addr_bytes = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + (groupz_tile + row_id ) * row_size_bytes + (groupy_tile + col_id ) * PACK * BATCH_PACK; - - __global uint* output_write = (__global uint *) &outputs [ slice_pack_addr_bytes ]; - - /* Each lane writes 4K values of 4 batches and 7 different columns */ - - /* 4K values of K=0..31 */ - - const uint mask_constant = 0xFF; - - __attribute__((opencl_unroll_hint(7))) - for ( int c=0; c<7; c++ ) - { - /* Get 4K4N values in uint4 - each uint containing 4N values of a K - swizzle the data and pack into another uint4 containing 4N4K values - each uint containing 4K values of a N. - Use block_writes for writing uint4 */ - - uint4 out_k4n4 = out_slm3 [ c*8 ]; - - //Pack 4K values of first n - uchar4 out_n0k4; - - out_n0k4.s0 = out_k4n4.s0 & mask_constant; - out_n0k4.s1 = out_k4n4.s1 & mask_constant; - out_n0k4.s2 = out_k4n4.s2 & mask_constant; - out_n0k4.s3 = out_k4n4.s3 & mask_constant; - - /* Assigning to uchar hence need to get the required bits to lower 8-bits*/ - - //Pack 4K values of second n - uchar4 out_n1k4; - - out_n1k4.s0 = (out_k4n4.s0 >> 8) & mask_constant; - out_n1k4.s1 = (out_k4n4.s1 >> 8) & mask_constant; - out_n1k4.s2 = (out_k4n4.s2 >> 8) & mask_constant; - out_n1k4.s3 = (out_k4n4.s3 >> 8) & mask_constant; - - //Pack 4K values of third n - uchar4 out_n2k4; - - out_n2k4.s0 = (out_k4n4.s0 >> 16) & mask_constant; - out_n2k4.s1 = (out_k4n4.s1 >> 16) & mask_constant; - out_n2k4.s2 = (out_k4n4.s2 >> 16) & mask_constant; - out_n2k4.s3 = (out_k4n4.s3 >> 16) & mask_constant; - - //Pack 4K values of fourth n - uchar4 out_n3k4; - - out_n3k4.s0 = (out_k4n4.s0 >> 24) & mask_constant; - out_n3k4.s1 = (out_k4n4.s1 >> 24) & mask_constant; - out_n3k4.s2 = (out_k4n4.s2 >> 24) & mask_constant; - out_n3k4.s3 = (out_k4n4.s3 >> 24) & mask_constant; - - uint4 out_n4k4; - - out_n4k4.s0 = as_uint ( out_n0k4 ); - out_n4k4.s1 = as_uint ( out_n1k4 ); - out_n4k4.s2 = as_uint ( out_n2k4 ); - out_n4k4.s3 = as_uint ( out_n3k4 ); - - intel_sub_group_block_write4 ( output_write , out_n4k4 ); - - output_write += 4*8; - } -} //end of kernel - -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl deleted file mode 100644 index 6a11e01bdc2..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_slm_7x7_rep4.cl +++ /dev/null @@ -1,1044 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the 
"License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/mmad.cl" - -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION \ - out_write_N2K4[0].s0 = convert_uchar_sat((float)outvec0.s0 * SCALE + bias_f.s0); /*K= lane_id,N=0*/ \ - out_write_N2K4[0].s1 = convert_uchar_sat((float)outvec1.s0 * SCALE + bias_f.s1); /*K= lane_id + 8,N=0*/\ - out_write_N2K4[0].s2 = convert_uchar_sat((float)outvec2.s0 * SCALE + bias_f.s2); /*K= lane_id + 16,N=0*/\ - out_write_N2K4[0].s3 = convert_uchar_sat((float)outvec3.s0 * SCALE + bias_f.s3); /*K= lane_id + 24,N=0*/\ - \ - out_write_N2K4[0].s4 = convert_uchar_sat((float)outvec0.s1 * SCALE + bias_f.s0); /*K= lane_id,N=1*/\ - out_write_N2K4[0].s5 = convert_uchar_sat((float)outvec1.s1 * SCALE + bias_f.s1); /*K= lane_id + 8,N=1*/\ - out_write_N2K4[0].s6 = convert_uchar_sat((float)outvec2.s1 * SCALE + bias_f.s2); /*K= lane_id + 16,N=1*/\ - out_write_N2K4[0].s7 = convert_uchar_sat((float)outvec3.s1 * SCALE + bias_f.s3); /*K= lane_id + 24,N=1*/\ - \ - out_write_N2K4[1].s0 = convert_uchar_sat((float)outvec0.s2 * SCALE + bias_f.s0); /*K= lane_id,N=2*/\ - out_write_N2K4[1].s1 = convert_uchar_sat((float)outvec1.s2 * SCALE + bias_f.s1); /*K= lane_id + 8,N=2*/\ - out_write_N2K4[1].s2 = convert_uchar_sat((float)outvec2.s2 * SCALE + bias_f.s2); /*K= lane_id + 16,N=2*/\ - out_write_N2K4[1].s3 = convert_uchar_sat((float)outvec3.s2 * SCALE + bias_f.s3); /*K= lane_id + 24,N=2*/\ - \ - out_write_N2K4[1].s4 = convert_uchar_sat((float)outvec0.s3 * SCALE + bias_f.s0); /*K= lane_id,N=3*/\ - out_write_N2K4[1].s5 = convert_uchar_sat((float)outvec1.s3 * SCALE + bias_f.s1); /*K= lane_id + 8,N=3*/\ - out_write_N2K4[1].s6 = convert_uchar_sat((float)outvec2.s3 * SCALE + bias_f.s2); /*K= lane_id + 16,N=3*/\ - out_write_N2K4[1].s7 = convert_uchar_sat((float)outvec3.s3 * SCALE + bias_f.s3); /*K= lane_id + 24,N=3*/ - -#elif NO_QUANTIZATION - -#define QUANTIZATION \ - out_write_N2K4[0].s0 = convert_uchar_sat(outvec0.s0); /*K= lane_id,N=0*/ \ - out_write_N2K4[0].s1 = convert_uchar_sat(outvec1.s0); /*K= lane_id + 8,N=0*/\ - out_write_N2K4[0].s2 = convert_uchar_sat(outvec2.s0); /*K= lane_id + 16,N=0*/\ - out_write_N2K4[0].s3 = convert_uchar_sat(outvec3.s0); /*K= lane_id + 24,N=0*/\ - \ - out_write_N2K4[0].s4 = convert_uchar_sat(outvec0.s1); /*K= lane_id,N=1*/\ - out_write_N2K4[0].s5 = convert_uchar_sat(outvec1.s1); /*K= lane_id + 8,N=1*/\ - out_write_N2K4[0].s6 = convert_uchar_sat(outvec2.s1); /*K= lane_id + 16,N=1*/\ - out_write_N2K4[0].s7 = convert_uchar_sat(outvec3.s1); /*K= lane_id + 24,N=1*/\ - \ - out_write_N2K4[1].s0 = convert_uchar_sat(outvec0.s2); /*K= lane_id,N=2*/\ - out_write_N2K4[1].s1 = convert_uchar_sat(outvec1.s2); /*K= lane_id + 8,N=2*/\ - out_write_N2K4[1].s2 = convert_uchar_sat(outvec2.s2); /*K= lane_id + 16,N=2*/\ - out_write_N2K4[1].s3 = convert_uchar_sat(outvec3.s2); /*K= lane_id + 24,N=2*/\ - \ - out_write_N2K4[1].s4 = convert_uchar_sat(outvec0.s3); /*K= lane_id,N=3*/\ - out_write_N2K4[1].s5 = convert_uchar_sat(outvec1.s3); /*K= lane_id + 8,N=3*/\ - out_write_N2K4[1].s6 = 
convert_uchar_sat(outvec2.s3); /*K= lane_id + 16,N=3*/\ - out_write_N2K4[1].s7 = convert_uchar_sat(outvec3.s3); /*K= lane_id + 24,N=3*/ - -#else - -#define QUANTIZATION \ - out_write_N2K4[0].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s0) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=0*/ \ - out_write_N2K4[0].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s0) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=0*/\ - out_write_N2K4[0].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s0) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=0*/\ - out_write_N2K4[0].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s0) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=0*/\ - \ - out_write_N2K4[0].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s1) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=1*/\ - out_write_N2K4[0].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s1) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=1*/\ - out_write_N2K4[0].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s1) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=1*/\ - out_write_N2K4[0].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s1) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=1*/\ - \ - out_write_N2K4[1].s0 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s2) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=2*/\ - out_write_N2K4[1].s1 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s2) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=2*/\ - out_write_N2K4[1].s2 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s2) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=2*/\ - out_write_N2K4[1].s3 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s2) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=2*/\ - \ - out_write_N2K4[1].s4 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec0.s3) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS)); /*K= lane_id,N=3*/\ - out_write_N2K4[1].s5 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec1.s3) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS)); /*K= lane_id + 8,N=3*/\ - out_write_N2K4[1].s6 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec2.s3) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS)); /*K= lane_id + 16,N=3*/\ - out_write_N2K4[1].s7 = as_uchar(ACTIVATION(convert_char(round(((float)(outvec3.s3) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS)); /*K= lane_id + 24,N=3*/ - -#endif - -// mapping to clDNN -#define _MMAD_4x8(C, A, B) MMAD_4x8(A, B, C) -#define _OD OUTPUT_FEATURE_NUM -#define _OW OUTPUT_SIZE_X -#define _OH OUTPUT_SIZE_Y -#define OWPAD (OUTPUT_PAD_BEFORE_SIZE_X + OUTPUT_PAD_AFTER_SIZE_X) -#define OHPAD (OUTPUT_PAD_BEFORE_SIZE_Y + OUTPUT_PAD_AFTER_SIZE_Y) -#define _IH INPUT0_SIZE_Y -#define _IW INPUT0_SIZE_X -#define _ID INPUT0_FEATURE_NUM -#define K_HEIGHT FILTER_SIZE_Y -#define K_WIDTH FILTER_SIZE_X -#define BATCH_SIZE OUTPUT_BATCH_NUM - -#define IHPAD 
(INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y) -#define IWPAD (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X) -#define K_STRIDE STRIDE_SIZE_X -// end of mapping - -// for now kernel stride is square -#define K_WSTRIDE K_STRIDE -#define K_HSTRIDE K_STRIDE - -#define PACK 32 -#define BATCH_PACK 4 - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(convolution_mmad_slm_7x7_rep4)( -__global int8 *inputs, -__global uchar* outputs, -__global int8* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif -#if QUANTIZATION_TERM - const __global float* quantizations, -#endif -#if CALIBRATION_TERM - const __global float* calibrations, -#endif - uint split_idx -) -{ - const uint TILE_H = OUT_BLOCK_HEIGHT*LOCAL_SIZE_Z; - const uint TILE_W = OUT_BLOCK_WIDTH*LOCAL_SIZE_Y; - - ushort fmg = get_group_id(0); // Output Depth - ushort group_y = get_group_id(1); // Output Width - ushort group_z = get_group_id(2); // Output Height - - /* 16,1,8 WG , SIMD8 - 16 HW threads in a WG - threads 0-1 : ( lid_x:0-15,lid_y:0,lid_z:0) - threads 2-3 : ( lid_x:0-15,lid_y:0,lid_z:1) - .. - threads 12-13: ( lid_x:0-15, lid_y:0,lid_z:6) - threads 14-15: ( lid_x:0-15, lid_y:0,lid_z:7) - */ - - /* Thread, local IDs */ - ushort thread_id = get_sub_group_id(); - ushort threadid_mod_2 = thread_id % 2; - ushort threadid_mod_8 = thread_id % 8; - - ushort lid_x = get_local_id(0); - ushort lid_z = get_local_id(2); - - uchar lane_id = get_sub_group_local_id(); - - /* 32-bit signed accumulator, 112 output registers for a 1Px7Qx4Nx32K output tile - Will be converted to 8 bits before the final write */ - - int4 out_07 [ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 0-7 - int4 out_815[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 8-15 - int4 out_1623[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 16-23 - int4 out_2431[ OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH ] = {0}; // For output channels 24-31 - - /* Account for batching */ - - ushort batch = ( fmg*LOCAL_SIZE_X*4 ) / _OD; // Each thread processes 32 output channels and each fmg processes 64 output channels; LOCAL_SIZE_X is only 16 - - // Size calculated for int8 elements - uint input_size = (_IH + IHPAD) * (_IW + IWPAD) * BATCH_PACK ; - - uint in_addr_offset = batch*input_size; - - /* Goto activation tile for work group, offset is w.r.t int8 array */ - - uint groupy_tile = TILE_W*group_y; - uint groupz_tile = TILE_H*group_z; - - in_addr_offset += (groupz_tile * K_STRIDE) * (_IW + IWPAD) * BATCH_PACK + (groupy_tile * K_STRIDE) * BATCH_PACK; - - /* SLM space for Activation, Weights - ( 16,1,8 ) Workgroup - 7 tiles along Y direction and 64 different output channels - 2 threads used to load global memory - Activation - 9Hx9Wx4Nx32C Weights - 3Rx3Sx64Kx32C */ - - __local int8 act_slm [ 9*9*4 ]; - __local int8 weight_slm [ 9*64 ]; - 
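As a sanity check on the two declarations above: an OpenCL int8 is 32 bytes, so the activation and weight staging buffers together occupy about 28 KB of SLM. The arithmetic as a small C program; the common 64 KB per-work-group SLM limit is an assumption about the target hardware, not something this kernel states:

    #include <stdio.h>

    int main(void)
    {
        const unsigned int8_bytes = 8 * 4;            /* OpenCL int8 = 8 x 32-bit  */
        unsigned act    = 9 * 9 * 4 * int8_bytes;     /* act_slm:    10368 bytes   */
        unsigned weight = 9 * 64   * int8_bytes;      /* weight_slm: 18432 bytes   */
        printf("SLM use: %u bytes\n", act + weight);  /* 28800 bytes, ~28 KB < 64 KB */
        return 0;
    }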
Distribute among 16 threads in Workgroup - threads 0-1 write 9x4x32 of H=0, W=0...8 - threads 2-3 write 9x4x32 of H=1, W=0...8 - threads 4-5 write 9x4x32 of H=2, W=0...8 - threads 6-7 write 9x4x32 of H=3, W=0...8 - threads 8-9 write 9x4x32 of H=4, W=0...8 - threads 10-11 write 9x4x32 of H=5,W=0...8 - threads 12-13 write 9x4x32 of H=6,W=0...8 - thread 14 writes 9x4x32 of H=7,W=0...8 - thread 15 writes 9x4x32 of H=8,W=0...8 */ - - /* Goto activation tile for thread in group */ - - uint row_offset = thread_id / 2; - - if ( thread_id >= 14 ) - { - row_offset = 7; - } - - // In addr offset for the particular thread - in_addr_offset += row_offset * K_STRIDE * (_IW + IWPAD ) * BATCH_PACK ; - - /* Activation SLM indices */ - uint act_slm_write = row_offset * ( TILE_W + 2) * BATCH_PACK; - uint act_slm_read = OUT_BLOCK_HEIGHT * lid_z * ( TILE_W + 2) * BATCH_PACK ; - - /* 9RSx64Kx32C Weight Block in SLM - thread0 handles ( reads from global ) w(0,0),w(0,1),w(0,2) of K=0,1 ( k=0..15 ) - thread1 handles w(0,0),w(0,1),w(0,2) of K=2,3 ( k=16..31) - thread2 handles w(1,0),w(1,1) of K=0,1 ( k=0..15) - thread3 handles w(1,0),w(1,1) of K=2,3 ( k=16..31) - thread4 handles w(1,2),w(2,0) of K=0,1 ( k=0..15) - thread5 handles w(1,2),w(2,0) of K=2,3 ( k=16..31) - thread6 handles w(2,1),w(2,2) of K=0,1 ( k=0..15) - thread7 handles w(2,1),w(2,2) of K=2,3 ( k=16..31) - - Similarly, threads 8-15 handle K=4,5,6,7 - - Weight Layout in SLM - - w(R=0,S=0,k=0..7,C=0..15),w(R=0,S=0,k=32..39,C=0..15) - w(R=0,S=0,k=0..7,C=16..31),w(R=0,S=0,k=32..39,C=16..31) - - The above interleaving is present to avoid SLM bank conflicts when fused threads read from SLM - Thread0 will read k=0..31, thread1 will read k=32..63 - - All output channels of one weight pixel are stored first, then the next weight pixel follows */ - - #define NUM_FILTERS (K_HEIGHT * K_WIDTH) - - uint output_depth = fmg % ( _OD / ( LOCAL_SIZE_X * 4 ) ); //LOCAL_SIZE_X=16, 64 output channels used - - uint weight_size_CRS = ( _ID / PACK ) * NUM_FILTERS * 8; //8 output channels packed inside - - // Global weight addr for workgroup - uint weight_global_addr_offset = output_depth * 8 * weight_size_CRS ; //64 output channels per workgroup - - /* Global weight address for thread */ - - // Goto appropriate output channel in weights - uint weight_global_channel_offset = threadid_mod_2 * 2 * weight_size_CRS ; - - uint slm_channel_offset = threadid_mod_2; - uint bc_fused_thread_offset = 0; - - if ( thread_id >= 8 ) - { - bc_fused_thread_offset = 1; - - weight_global_channel_offset = 4 * weight_size_CRS + slm_channel_offset * weight_size_CRS * 2 ; - } - - // Goto appropriate pixel in weights - - uint weight_global_pixel_offset = 0; - uint slm_pixel_offset = 0; - - if ( threadid_mod_8 >=2 ) - { - /* First three pixels handled by threads 0-1, then 2 pixels handled by two threads */ - - weight_global_pixel_offset = 3*8 + ( ( (threadid_mod_8/2) - 1 )*2*8 ); - slm_pixel_offset = 3*64 + ( ( (threadid_mod_8/2) - 1 )*2*64 ); - } - - weight_global_addr_offset += weight_global_channel_offset + weight_global_pixel_offset; - - /* Weight slm write index */ - - uint slm_write_weight = slm_pixel_offset + slm_channel_offset * 32 + bc_fused_thread_offset * 4; - - /* Weight slm read index */ - - /* Thread 0 reads output channels 0-15, thread 1 handles output channels 16-31, data present in interleaved - manner in SLM - Data layout in SLM - - w(0,0) C=0..7, K = 0..7 | w(0,0) C=0..7, K = 32..39 - w(0,0) C=8..15,K=0..7 | w(0,0) C=8..15,K = 32..39 - w(0,0) C=0..7, K=8..15 | w(0,0) C=0..7, K = 40..47 - w(0,0)
C=8..15,K=8..15 | w(0,0) C=8..15,K= 40..47 - - */ - uint wt_slm_rd_offset = threadid_mod_2*4; - - int kd; - - __attribute__((opencl_unroll_hint(1))) - for(kd = 0; kd < ( _ID / PACK ) ; kd++) - { - { - /* Load Activation from global to SLM */ - - int in_addr = kd * (_IH + IHPAD) * (_IW + IWPAD) * BATCH_SIZE + in_addr_offset; - - __global uint *activation_tile = (__global uint*)&inputs[ in_addr ]; - - __local uint *act_slm_ptr = (__local uint *) &act_slm [ act_slm_write ]; - - /* The odd thread in fused pair will start from next 4x8 block */ - - activation_tile += threadid_mod_2*4*8; - act_slm_ptr += threadid_mod_2*4*8; - - int4 act_col_0 = as_int4( intel_sub_group_block_read4(activation_tile) );//col 0 - int4 act_col_1 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) );//col 2 - int4 act_col_2 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) );//col 4 - int4 act_col_3 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) );//col 6 - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_0 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_1 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_2 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_3 ) ); - - if ( threadid_mod_2 == 0 ) - { - int4 act_col_4 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_4 ) ); - } - - if ( thread_id >=14) - { - activation_tile = activation_tile + 1 * (_IW + IWPAD ) * BATCH_PACK * 8; - act_slm_ptr = act_slm_ptr + (TILE_W + 2) * BATCH_PACK *8; - - int4 act_col_9 = as_int4( intel_sub_group_block_read4(activation_tile) ); - int4 act_col_10 = as_int4( intel_sub_group_block_read4(activation_tile + 8*8) ); - int4 act_col_11 = as_int4( intel_sub_group_block_read4(activation_tile + 2*8*8) ); - int4 act_col_12 = as_int4( intel_sub_group_block_read4(activation_tile + 3*8*8) ); - - SLM_BLOCK_WRITE_4 ( act_slm_ptr , as_uint4 ( act_col_9 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 8*8 ) , as_uint4 ( act_col_10 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 2*8*8 ) , as_uint4 ( act_col_11 ) ); - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 3*8*8 ) , as_uint4 ( act_col_12 ) ); - - if ( threadid_mod_2 == 0 ) - { - int4 act_col_13 = as_int4( intel_sub_group_block_read4(activation_tile + 4*8*8) ); - - SLM_BLOCK_WRITE_4 ( ( act_slm_ptr + 4*8*8 ) , as_uint4 ( act_col_13 ) ); - } - } - - /* load weights from global to weight_slm */ - - int weight_addr = kd * NUM_FILTERS * 8 + weight_global_addr_offset; - - __global uint *weight_tile = (__global uint*)&weights [ weight_addr ]; - __local uint *wt_slm_ptr = (__local uint *)&weight_slm [ slm_write_weight ]; - - __global uint *weight_tile_2 = weight_tile; - __local uint *wt_slm_ptr_2 = wt_slm_ptr; - - int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); // Pixel1 K=0..7 C=0..15 - int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=0..7 C=16..31 - int4 w2 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // Pixel2 K=0..7 C=0..15 - int4 w3 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=0..7 C=16..31 - - // Goto next output channel - weight_tile += weight_size_CRS*8; - - int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile ) ); // Pixel1 K=8..15 C=0..15 - int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile + 4*8 ) ); // Pixel1 K=8..15 C=16..31 - int4 w6 = as_int4 ( intel_sub_group_block_read4( weight_tile + 8*8 ) ); // 
Pixel2 K=8..15 C=0..15 - int4 w7 = as_int4 ( intel_sub_group_block_read4( weight_tile + 12*8 ) );// Pixel2 K=8..15 C=16..31 - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr, as_uint4 ( w0 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w1 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ), as_uint4 ( w2 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ), as_uint4 ( w3 ) ); - - wt_slm_ptr += 16*8; - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr , as_uint4 ( w4 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 8*8 ) , as_uint4 ( w5 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 ) , as_uint4 ( w6 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr + 64*8 + 8*8 ) , as_uint4 ( w7 ) ); - - if( threadid_mod_8 < 2 ) - { - // Goto next pixel - weight_tile_2 += 16*8; - wt_slm_ptr_2 += 2*64*8; - - int4 w0 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) ); // Pixel1 K=0..7 C=0..15 - int4 w1 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) ); // Pixel1 K=0..7 C=16..31 - - // Goto next output channel - weight_tile_2 += weight_size_CRS*8; - - int4 w4 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 ) ); // Pixel1 K=8..15 C=0..15 - int4 w5 = as_int4 ( intel_sub_group_block_read4( weight_tile_2 + 4*8 ) ); // Pixel1 K=8..15 C=16..31 - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2, as_uint4 ( w0 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w1 ) ); - - wt_slm_ptr_2 += 16*8; - - SLM_BLOCK_WRITE_4 ( wt_slm_ptr_2 , as_uint4 ( w4 ) ); - SLM_BLOCK_WRITE_4 ( ( wt_slm_ptr_2 + 8*8 ) , as_uint4 ( w5 ) ); - } - } - - // Synchronize SLM writes across workgroup - barrier(CLK_LOCAL_MEM_FENCE); - - if ( lid_z <= 6 ) - { - uint wt_slm_rd = wt_slm_rd_offset; - - __local uint *slm_ptr0 = (__local uint *) &act_slm[ act_slm_read ]; - __local uint *slm_ptr1 = (__local uint *) &weight_slm[ wt_slm_rd ]; - - /* balancing load of weights, activations */ - int8 weights_reg[3]; //24 registers - int4 act_reg[18]; //72 registers - uint slm_read_pixel_offset = 64*8; - - /********************************************************************************************************** - First phase - multiply first row of weights and 1st row of activations - ***********************************************************************************************************/ - - /* Load weights from SLM into registers - row0, output channels 0..7 */ - - { - __local uint *slm_ptrw0 = slm_ptr1; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - /* load 1Hx9Wx4N inputs, Activation row0 */ - - __attribute__((opencl_unroll_hint(9))) - for (int ic = 0; ic < 9; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = ic * BATCH_PACK * 8 ; - - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - /* Convolve */ - - /* order the mmad instructions to minimize dependency on src0,dst - also try to maximise reuse of weights-reg*/ - - /* Output channels 0-7 */ - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], 
weights_reg[0] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row0, output channels 8..15 */ - - { - __local uint *slm_ptrw0 = slm_ptr1 + 2*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[3], weights_reg[0] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row0, output channels 
16..23 */ - { - __local uint *slm_ptrw0 = slm_ptr1 + 4*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] ); - - /* load 1Hx9Wx4N inputs, Activation row1 */ - - uint slm_row_offset_2 = 1*(TILE_W + 2)*BATCH_PACK*8; - - __attribute__((opencl_unroll_hint(9))) - for (int ic = 0; ic < 9; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = slm_row_offset_2 + ic * BATCH_PACK * 8 ; - - act_reg [ ic + 9 ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row0, output channels 24..31 */ - { - __local uint *slm_ptrw0 = slm_ptr1 + 6*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - slm_ptrw0 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw0 + 64 ) ); - } - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] ); 
- out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] ); - - /********************************************************************************************************** - Second phase - multiply second row of weights and second row of activations - ***********************************************************************************************************/ - - /* Load weights from SLM into registers - row1, output channels 0..7 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[9], weights_reg[0] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[10], weights_reg[0] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[11], weights_reg[0] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[12], weights_reg[0] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[13], weights_reg[0] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[14], weights_reg[0] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[15], weights_reg[0] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[10], weights_reg[1] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[11], weights_reg[1] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[12], weights_reg[1] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[13], weights_reg[1] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[14], weights_reg[1] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[15], weights_reg[1] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[16], weights_reg[1] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[11], weights_reg[2] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[12], weights_reg[2] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[13], weights_reg[2] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[14], weights_reg[2] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[15], weights_reg[2] ); - out_07[ 5 ] = 
_MMAD_4x8 ( out_07[ 5 ], act_reg[16], weights_reg[2] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[17], weights_reg[2] ); - - /* Load weights from SLM into registers - row1, output channels 8..15 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 2*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[9], weights_reg[0] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[10], weights_reg[0] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[11], weights_reg[0] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[12], weights_reg[0] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[13], weights_reg[0] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[14], weights_reg[0] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[15], weights_reg[0] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[10], weights_reg[1] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[11], weights_reg[1] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[12], weights_reg[1] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[13], weights_reg[1] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[14], weights_reg[1] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[15], weights_reg[1] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[16], weights_reg[1] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[11], weights_reg[2] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[12], weights_reg[2] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[13], weights_reg[2] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[14], weights_reg[2] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[15], weights_reg[2] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[16], weights_reg[2] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[17], weights_reg[2] ); - - /* Load weights from SLM into registers - row1, output channels 16..23 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 4*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[9], weights_reg[0] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[10], weights_reg[0] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[11], weights_reg[0] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[12], weights_reg[0] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[13], weights_reg[0] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[14], weights_reg[0] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[15], weights_reg[0] 
); - - /* load 1Hx9Wx4N inputs, Activation row2 */ - - uint slm_row_offset_3 = 2*(TILE_W + 2)*BATCH_PACK*8; - - __attribute__((opencl_unroll_hint(9))) - for (int ic = 0; ic < 9; ic++) - { - /* Load activations from SLM into registers */ - - uint slm_offset = slm_row_offset_3 + ic * BATCH_PACK * 8 ; - - act_reg [ ic ] = as_int4 (SLM_BLOCK_READ_4 (slm_ptr0 + slm_offset)) ; - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[10], weights_reg[1] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[11], weights_reg[1] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[12], weights_reg[1] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[13], weights_reg[1] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[14], weights_reg[1] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[15], weights_reg[1] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[16], weights_reg[1] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[11], weights_reg[2] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[12], weights_reg[2] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[13], weights_reg[2] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[14], weights_reg[2] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[15], weights_reg[2] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[16], weights_reg[2] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[17], weights_reg[2] ); - - /* Load weights from SLM into registers - row1, output channels 24..31 */ - { - __local uint *slm_ptrw1 = slm_ptr1 + 3*slm_read_pixel_offset + 6*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - slm_ptrw1 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw1 + 64 ) ); - } - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[9], weights_reg[0] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[10], weights_reg[0] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[11], weights_reg[0] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[12], weights_reg[0] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[13], weights_reg[0] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[14], weights_reg[0] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[15], weights_reg[0] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[10], weights_reg[1] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[11], weights_reg[1] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[12], weights_reg[1] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[13], weights_reg[1] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[14], weights_reg[1] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[15], weights_reg[1] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[16], weights_reg[1] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[11], weights_reg[2] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[12], weights_reg[2] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[13], weights_reg[2] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[14], weights_reg[2] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[15], 
weights_reg[2] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[16], weights_reg[2] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[17], weights_reg[2] ); - - /********************************************************************************************************** - Third phase - multiply third row of weights and third row of activations - ***********************************************************************************************************/ - - /* Load weights from SLM into registers - row2, output channels 0..7 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[0], weights_reg[0] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[1], weights_reg[0] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[2], weights_reg[0] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[3], weights_reg[0] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[4], weights_reg[0] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[5], weights_reg[0] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[6], weights_reg[0] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[1], weights_reg[1] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[2], weights_reg[1] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[3], weights_reg[1] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[4], weights_reg[1] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[5], weights_reg[1] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[6], weights_reg[1] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[7], weights_reg[1] ); - - out_07[ 0 ] = _MMAD_4x8 ( out_07[ 0 ], act_reg[2], weights_reg[2] ); - out_07[ 1 ] = _MMAD_4x8 ( out_07[ 1 ], act_reg[3], weights_reg[2] ); - out_07[ 2 ] = _MMAD_4x8 ( out_07[ 2 ], act_reg[4], weights_reg[2] ); - out_07[ 3 ] = _MMAD_4x8 ( out_07[ 3 ], act_reg[5], weights_reg[2] ); - out_07[ 4 ] = _MMAD_4x8 ( out_07[ 4 ], act_reg[6], weights_reg[2] ); - out_07[ 5 ] = _MMAD_4x8 ( out_07[ 5 ], act_reg[7], weights_reg[2] ); - out_07[ 6 ] = _MMAD_4x8 ( out_07[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row2, output channels 8..15 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 2*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[0], weights_reg[0] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[1], weights_reg[0] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[2], weights_reg[0] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 
], act_reg[3], weights_reg[0] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[4], weights_reg[0] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[5], weights_reg[0] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[6], weights_reg[0] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[1], weights_reg[1] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[2], weights_reg[1] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[3], weights_reg[1] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[4], weights_reg[1] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[5], weights_reg[1] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[6], weights_reg[1] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[7], weights_reg[1] ); - - out_815[ 0 ] = _MMAD_4x8 ( out_815[ 0 ], act_reg[2], weights_reg[2] ); - out_815[ 1 ] = _MMAD_4x8 ( out_815[ 1 ], act_reg[3], weights_reg[2] ); - out_815[ 2 ] = _MMAD_4x8 ( out_815[ 2 ], act_reg[4], weights_reg[2] ); - out_815[ 3 ] = _MMAD_4x8 ( out_815[ 3 ], act_reg[5], weights_reg[2] ); - out_815[ 4 ] = _MMAD_4x8 ( out_815[ 4 ], act_reg[6], weights_reg[2] ); - out_815[ 5 ] = _MMAD_4x8 ( out_815[ 5 ], act_reg[7], weights_reg[2] ); - out_815[ 6 ] = _MMAD_4x8 ( out_815[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from SLM into registers - row2, output channels 16..23 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 4*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[0], weights_reg[0] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[1], weights_reg[0] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[2], weights_reg[0] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[3], weights_reg[0] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[4], weights_reg[0] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[5], weights_reg[0] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[6], weights_reg[0] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[1], weights_reg[1] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[2], weights_reg[1] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[3], weights_reg[1] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[4], weights_reg[1] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[5], weights_reg[1] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[6], weights_reg[1] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[7], weights_reg[1] ); - - out_1623[ 0 ] = _MMAD_4x8 ( out_1623[ 0 ], act_reg[2], weights_reg[2] ); - out_1623[ 1 ] = _MMAD_4x8 ( out_1623[ 1 ], act_reg[3], weights_reg[2] ); - out_1623[ 2 ] = _MMAD_4x8 ( out_1623[ 2 ], act_reg[4], weights_reg[2] ); - out_1623[ 3 ] = _MMAD_4x8 ( out_1623[ 3 ], act_reg[5], weights_reg[2] ); - out_1623[ 4 ] = _MMAD_4x8 ( out_1623[ 4 ], act_reg[6], weights_reg[2] ); - out_1623[ 5 ] = _MMAD_4x8 ( out_1623[ 5 ], act_reg[7], weights_reg[2] ); - out_1623[ 6 ] = _MMAD_4x8 ( out_1623[ 6 ], act_reg[8], weights_reg[2] ); - - /* Load weights from 
SLM into registers - row2, output channels 24..31 */ - { - __local uint *slm_ptrw2 = slm_ptr1 + 6*slm_read_pixel_offset + 6*8*8; - - weights_reg[0].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[0].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[1].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[1].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - slm_ptrw2 += slm_read_pixel_offset; - - weights_reg[2].s0123 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 ) ); - weights_reg[2].s4567 = as_int4 ( SLM_BLOCK_READ_4 ( slm_ptrw2 + 64 ) ); - } - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[0], weights_reg[0] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[1], weights_reg[0] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[2], weights_reg[0] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[3], weights_reg[0] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[4], weights_reg[0] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[5], weights_reg[0] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[6], weights_reg[0] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[1], weights_reg[1] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[2], weights_reg[1] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[3], weights_reg[1] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[4], weights_reg[1] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[5], weights_reg[1] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[6], weights_reg[1] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[7], weights_reg[1] ); - - out_2431[ 0 ] = _MMAD_4x8 ( out_2431[ 0 ], act_reg[2], weights_reg[2] ); - out_2431[ 1 ] = _MMAD_4x8 ( out_2431[ 1 ], act_reg[3], weights_reg[2] ); - out_2431[ 2 ] = _MMAD_4x8 ( out_2431[ 2 ], act_reg[4], weights_reg[2] ); - out_2431[ 3 ] = _MMAD_4x8 ( out_2431[ 3 ], act_reg[5], weights_reg[2] ); - out_2431[ 4 ] = _MMAD_4x8 ( out_2431[ 4 ], act_reg[6], weights_reg[2] ); - out_2431[ 5 ] = _MMAD_4x8 ( out_2431[ 5 ], act_reg[7], weights_reg[2] ); - out_2431[ 6 ] = _MMAD_4x8 ( out_2431[ 6 ], act_reg[8], weights_reg[2] ); - } - - // To make sure all threads in WG have finished compute before next depth tile of activation and weights are loaded into SLM - barrier(CLK_LOCAL_MEM_FENCE); - } //for kd - - /**************************************************************************************************************** - *******************************Output Write Stage**************************************************************** - ****************************************************************************************************************/ - /* - Outputs will be passed through activation function and quantized to 8 bits before writing - Output will be in same format as input [K/32][N/4][P][Q][4N][32K] */ - - /******************* Write output to SLM *************************************/ - - /* Quantize and pack 4x1 byte - from consecutive n-coordinates - Each thread produces [1P][7Q][4N][32K] - Write uint32 from each lane to SLM , the entire thread will write 32 consecutive K-coordinates - - Assume one SLM row as 32 uints ( 32 channels , four batches for each channel - 4NK ) - In SLM 7x7x4x32 present first then the next 32 channels - */ - - if( lid_z <= 6 ) - { - /* feature maps are an array of slicePacks, each H,W position within the slice pack contains 32 8bit feature maps(channels) of 4 different batches */ - uint
row_size_bytes = (_OW + OWPAD) * PACK * BATCH_PACK; - - /* slice_pack is a pack of 32 feature map tiles that are [OH][OW][4][32] that are stored within the full [K/32][N/4][OH][OW][4][32] output */ - uint slice_pack_size_bytes = row_size_bytes * (_OH + OHPAD); - - /* Each output_depth WG writes 64 output channels */ - - uint output_depth_index = output_depth*2 + threadid_mod_2; - uint batch_index = batch; - - /* Each WG produces entire 7x7 output, hence no group_y, group_z tiling */ - - uint output_offset_x = groupy_tile * OUT_X_PITCH; - uint output_offset_y = groupz_tile * OUT_Y_PITCH; - uint slice_pack_addr_bytes = output_depth_index * slice_pack_size_bytes * ( BATCH_SIZE / BATCH_PACK ) + batch_index * slice_pack_size_bytes + lid_z * row_size_bytes; - - __global uchar* output_write_ptr = (__global uchar *) &outputs [ slice_pack_addr_bytes + output_offset_x + output_offset_y ]; - - const uint feature = output_depth_index * 32 + get_sub_group_local_id(); - - const float4 quant_f = as_float4(intel_sub_group_block_read4((__global uint*) (quantizations + feature) )); - const float4 bias_f = as_float4(intel_sub_group_block_read4((__global uint*) (biases + feature) )); - const float4 calib_f = as_float4(intel_sub_group_block_read4((__global uint*) (calibrations + feature) )); - - __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH))) - for (int col = 0; col < OUT_BLOCK_WIDTH; col++) - { - - int4 outvec0 = out_07[col]; - int4 outvec1 = out_815[col]; - int4 outvec2 = out_1623[col]; - int4 outvec3 = out_2431[col]; - - /* Non-Linear Activation & Quantization code */ - - uchar8 out_write_N2K4[2]; - - QUANTIZATION; - - intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[0] ); - output_write_ptr += 64; - intel_sub_group_block_write_uc8 ( output_write_ptr , out_write_N2K4[1] ); - output_write_ptr += 64; - - } // out_block_width-for loop - }//lid_z loop -} //end of kernel - -#undef SCAL -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl deleted file mode 100644 index 0066ddd618b..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_1x1.cl +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
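The convolution_grad_weights kernels removed in this patch all finish with the same per-weight SGD step: the accumulated gradient is scaled by lr, an L2 weight-decay term (DECAY_RATE) is added, and, when MOMENTUM is defined, the previous update is blended in via MOMENTUM_FACTOR. As a reading aid only (not part of the patch), here is a minimal scalar C sketch of the MOMENTUM branch of the 1x1 variant below; all names are illustrative:

#include <stdio.h>

/* Hypothetical scalar model of the per-weight update performed by the
 * deleted convolution_grad_weights_gpu_1x1 kernel when MOMENTUM is set:
 *   update = lr * (prev * MOMENTUM_FACTOR + grad + DECAY_RATE * w)
 *   w -= update; prev = update;
 * The real kernel applies this to UNIT_TYPE buffers at weights_idx. */
static float sgd_momentum_step(float *w, float *prev, float grad,
                               float lr, float momentum, float decay)
{
    float update = lr * (*prev * momentum + grad + decay * *w);
    *w -= update;     /* descend along the smoothed, decayed gradient */
    *prev = update;   /* remember the update for the next iteration */
    return update;
}

int main(void)
{
    float w = 0.5f, prev = 0.0f;
    sgd_momentum_step(&w, &prev, 0.1f /*grad*/, 0.01f /*lr*/,
                      0.9f /*momentum*/, 0.0005f /*decay*/);
    printf("w=%f prev_update=%f\n", w, prev);
    return 0;
}

Note that the reference and yxfb variants further below order the terms differently, computing lr * (grad + DECAY_RATE * w) + prev * MOMENTUM_FACTOR, so there the momentum term is not scaled by the learning rate.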
- -#include "include/include_all.cl" - -__attribute__((intel_reqd_sub_group_size(16))) -KERNEL(convolution_grad_weights_gpu_1x1)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint local_id = get_local_id(0); - const uint ifm = get_global_id(1); - const uint ofm = get_global_id(2); - - const int in_x = -PADDING_SIZE_X; - const int in_y = -PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w = 0; - -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - UNIT_TYPE result = UNIT_VAL_ZERO; - -#if BIAS_TERM - UNIT_TYPE result_bias = UNIT_VAL_ZERO; -#endif - - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for (uint i = 0; i < INPUT0_SIZE_Y; i++) - { - const int input_offset_y = in_y + i * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - for (uint j = 0; j < (INPUT0_SIZE_X + 15)/16; j++) - { - const int input_offset_x = in_x + j * STRIDE_SIZE_X * 16 + local_id * STRIDE_SIZE_X; - const bool zero_x = input_offset_x >= INPUT1_SIZE_X || input_offset_x < 0; - const bool grad_zero = j*16 + local_id >= INPUT0_SIZE_X; -#if BIAS_TERM - UNIT_TYPE grad; - if(grad_zero) - { - grad = 0; - } - else - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH*16 + local_id*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad = input_grad[input_grad_idx]; - } -#endif - if(!zero_x && !zero_y) - { - uint input_idx = in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; -#if BIAS_TERM - result = fma(input[input_idx], grad, result); -#else - if(!grad_zero) - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH*16 + local_id*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - result = fma(input[input_idx], input_grad[input_grad_idx], result); - } -#endif - } -#if BIAS_TERM - result_bias += grad; -#endif - } - } - - grad_w += result; - -#if BIAS_TERM - grad_b += result_bias; -#endif - } - - grad_w = sub_group_reduce_add(grad_w); -#if BIAS_TERM - grad_b = sub_group_reduce_add(grad_b); -#endif - - if (local_id == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_w = lr * (prev_grad_w[weights_idx] * MOMENTUM_FACTOR + grad_w + DECAY_RATE * filter[weights_idx]); - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; -#else - filter[weights_idx] -= lr * grad_w + DECAY_RATE * lr * filter[weights_idx]; -#endif - -#if BIAS_TERM - if(ifm == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif - } -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl deleted file mode 100644 index df6a4595708..00000000000 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_3x3.cl +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - -KERNEL(convolution_grad_weights_gpu_3x3)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint ofm = get_global_id(0); - const uint ifm = get_global_id(1); - - if (ofm >= INPUT0_FEATURE_NUM || ifm >= INPUT1_FEATURE_NUM) - return; - - const int in_x = -PADDING_SIZE_X; - const int in_y = -PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w[9] = {}; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for (uint i = 0; i < INPUT0_SIZE_Y; i++) - { - - for (uint j = 0; j < INPUT0_SIZE_X; j+=2) - { - float2 grad; - if (j + 1 >= INPUT0_SIZE_X) - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad.s0 = input_grad[input_grad_idx]; - grad.s1 = 0; - } - else - { - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad = vload2(0, &input_grad[input_grad_idx]); - } - for (uint y = 0; y < 3; y++) - { - const int input_offset_y = in_y + y + i; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - const int input_offset_x = in_x + j; - const bool zero_x = input_offset_x < 0 || input_offset_x + 3 >= INPUT1_SIZE_X; - uint input_idx = in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; - union v4 { - float s[4]; - float4 v; - }; - union v4 inp; - if (zero_y) - continue; - if (zero_x) - { - for (uint k = 0; k < 4; k++) - { - if (input_offset_x + k >= INPUT1_SIZE_X || input_offset_x + k < 0) - inp.s[k] = 0; - else - inp.s[k] = input[input_idx + k]; - } - } - else - { - inp.v = vload4(0, &input[input_idx]); - } - for (uint x = 0; x < 3; x++) - { - grad_w[y * 3 + x] = mad(inp.s[x] * lr, grad.s0, grad_w[y * 3 + x]); - grad_w[y * 3 + x] = mad(inp.s[x + 1] * lr, grad.s1, grad_w[y * 3 + x]); - } - } -#if BIAS_TERM - grad_b += grad.s0; - grad_b += grad.s1; -#endif - } - } - } - - union { - float s[8]; - float8 v; - } uweights_0_7; - float uweights8; - -#if MOMENTUM - float dwa[9]; - uweights_0_7.v = vload8(0, &prev_grad_w[weights_idx]); - dwa[0 * 3 + 0] = uweights_0_7.v.s0; - dwa[0 * 3 + 1] = 
uweights_0_7.v.s1; - dwa[0 * 3 + 2] = uweights_0_7.v.s2; - dwa[1 * 3 + 0] = uweights_0_7.v.s3; - dwa[1 * 3 + 1] = uweights_0_7.v.s4; - dwa[1 * 3 + 2] = uweights_0_7.v.s5; - dwa[2 * 3 + 0] = uweights_0_7.v.s6; - dwa[2 * 3 + 1] = uweights_0_7.v.s7; - dwa[2 * 3 + 2] = prev_grad_w[weights_idx + 8]; -#endif - - uweights_0_7.v = vload8(0, &filter[weights_idx]); - uweights8 = filter[weights_idx + 8]; - -#if MOMENTUM - float8 newDelta_0_7 = (float8)( - grad_w[0 * 3 + 0] + (MOMENTUM_FACTOR * dwa[0 * 3 + 0]), - grad_w[0 * 3 + 1] + (MOMENTUM_FACTOR * dwa[0 * 3 + 1]), - grad_w[0 * 3 + 2] + (MOMENTUM_FACTOR * dwa[0 * 3 + 2]), - grad_w[1 * 3 + 0] + (MOMENTUM_FACTOR * dwa[1 * 3 + 0]), - grad_w[1 * 3 + 1] + (MOMENTUM_FACTOR * dwa[1 * 3 + 1]), - grad_w[1 * 3 + 2] + (MOMENTUM_FACTOR * dwa[1 * 3 + 2]), - grad_w[2 * 3 + 0] + (MOMENTUM_FACTOR * dwa[2 * 3 + 0]), - grad_w[2 * 3 + 1] + (MOMENTUM_FACTOR * dwa[2 * 3 + 1])); - float newDelta8 = grad_w[2 * 3 + 2] + (MOMENTUM_FACTOR * dwa[2 * 3 + 2]); -#else - float8 newDelta_0_7 = (float8)( - grad_w[0 * 3 + 0], - grad_w[0 * 3 + 1], - grad_w[0 * 3 + 2], - grad_w[1 * 3 + 0], - grad_w[1 * 3 + 1], - grad_w[1 * 3 + 2], - grad_w[2 * 3 + 0], - grad_w[2 * 3 + 1]); - float newDelta8 = grad_w[2 * 3 + 2]; -#endif - uweights8 -= newDelta8; - uweights_0_7.v -= newDelta_0_7; - - vstore8(uweights_0_7.v, 0, &filter[weights_idx]); - filter[weights_idx + 8] = uweights8; -#if MOMENTUM - vstore8(newDelta_0_7, 0, &prev_grad_w[weights_idx]); - prev_grad_w[weights_idx + 8] = newDelta8; -#endif - -#if BIAS_TERM - if(ifm == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl deleted file mode 100644 index e5d9fde8403..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_7x7.cl +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
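Each grad-weights kernel accumulates, for a given filter tap, the product of the forward input and the incoming gradient over every batch and output position, with taps that fall into the zero padding contributing nothing. A naive scalar C reference of that accumulation follows, simplified to a single input and output channel with illustrative names; the real kernels also loop over ifm/ofm, and the 3x3 and 7x7 variants fold lr into the accumulation itself:

#include <stdio.h>

/* Hypothetical scalar reference for the weight-gradient accumulation in
 * the deleted kernels, for one filter tap (ky, kx):
 *   dW[ky][kx] = sum over b, i, j of
 *     in[b][i*stride + ky - pad][j*stride + kx - pad] * out_grad[b][i][j]
 * where out-of-range input positions contribute zero. */
static float weight_grad_tap(const float *in, const float *out_grad,
                             int B, int IH, int IW, int OH, int OW,
                             int stride, int pad, int ky, int kx)
{
    float g = 0.0f;
    for (int b = 0; b < B; b++)
        for (int i = 0; i < OH; i++)
            for (int j = 0; j < OW; j++) {
                int iy = i * stride + ky - pad;  /* input row hit by this tap */
                int ix = j * stride + kx - pad;  /* input col hit by this tap */
                if (iy < 0 || iy >= IH || ix < 0 || ix >= IW)
                    continue;                    /* tap lands in zero padding */
                g += in[(b * IH + iy) * IW + ix]
                   * out_grad[(b * OH + i) * OW + j];
            }
    return g;
}

int main(void)
{
    /* 2x2 output of a stride-1, pad-0, 2x2 filter over one 3x3 input */
    float in[1 * 3 * 3]   = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
    float dout[1 * 2 * 2] = { 1, 0, 0, 1 };
    printf("dW[0][0] = %f\n",
           weight_grad_tap(in, dout, 1, 3, 3, 2, 2, 1, 0, 0, 0));
    return 0;
}

The SIMD16 variants (1x1 and yxfb) spread one of these loops across sub-group lanes and combine the partial sums with sub_group_reduce_add before applying the update.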
- -#include "include/include_all.cl" - -KERNEL(convolution_grad_weights_gpu_7x7)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint x_filter = get_global_id(0); - const uint ofm = get_global_id(1); - const uint ifm = get_global_id(2); - - if (x_filter >= 7 || ofm >= INPUT0_FEATURE_NUM || ifm >= INPUT1_FEATURE_NUM) - return; - - const int in_x = -PADDING_SIZE_X; - const int in_y = -PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w[7] = { 0, 0, 0, 0, 0, 0, 0 }; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = UNIT_VAL_ZERO; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for(int i = 0; i < INPUT0_SIZE_Y; i++) - { - for(int j = 0; j < INPUT0_SIZE_X; j++) - { - float grad; - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - grad = input_grad[input_grad_idx]; - for(uint y_filter = 0; y_filter < 7; y_filter++) - { - const int input_offset_y = in_y + y_filter + i * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - const int input_offset_x = in_x + x_filter + j * STRIDE_SIZE_X; - const bool zero_x = input_offset_x < 0 || input_offset_x >= INPUT1_SIZE_X; - uint input_idx = in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; - if(!zero_x && !zero_y) - { - const float delta_f = input[input_idx] * lr * grad; - grad_w[y_filter] += delta_f; - } - } -#if BIAS_TERM - grad_b += grad; -#endif - } - } - } - for(uint y_filter = 0; y_filter < 7; y_filter++) - { - uint address = weights_idx + 48 - (7 * (6 - y_filter) + (6 - x_filter)); -#if MOMENTUM - float dw = prev_grad_w[address]; - const float delta_f_m = MOMENTUM_FACTOR * dw; - grad_w[y_filter] += delta_f_m; - prev_grad_w[address] = grad_w[y_filter]; -#endif - filter[address] -= grad_w[y_filter]; - } -#if BIAS_TERM - if(ifm == 0 && x_filter == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl deleted file mode 100644 index 98bbc29c44d..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_ref.cl +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" -KERNEL(convolution_grad_weights_gpu_ref)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global float* filter, -#if BIAS_TERM - __global float* bias, -#endif -#if MOMENTUM - __global float* prev_grad_w, -#if BIAS_TERM - __global float* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint ofm_ifm = get_global_id(0); - const uint id_x = (uint)get_global_id(1); - const uint id_y = (uint)get_global_id(2); - const uint ifm = ofm_ifm % INPUT1_FEATURE_NUM; - const uint ofm = ofm_ifm / INPUT1_FEATURE_NUM; - - const int in_x = id_x - PADDING_SIZE_X; - const int in_y = id_y - PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w = 0; - -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH + id_y * FILTER_Y_PITCH + id_x * FILTER_X_PITCH; - - for(int b = 0; b < INPUT0_BATCH_NUM; b++) - { - ACCUMULATOR_TYPE result = ACCUMULATOR_TYPE_ZERO; - -#if BIAS_TERM - ACCUMULATOR_TYPE result_bias = ACCUMULATOR_TYPE_ZERO; -#endif - - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - for (uint i = 0; i < INPUT0_SIZE_Y; i++) - { - for (uint j = 0; j < INPUT0_SIZE_X; j++) - { - const int input_offset_y = in_y + i * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - const int input_offset_x = in_x + j * STRIDE_SIZE_X; - const bool zero_x = input_offset_x >= INPUT1_SIZE_X || input_offset_x < 0; -#if BIAS_TERM - uint input_grad_idx = grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - ACCUMULATOR_TYPE grad = TO_ACCUMULATOR_TYPE(input_grad[input_grad_idx]); -#endif - if(!zero_x && !zero_y) - { - uint input_idx = INPUT1_OFFSET + in_split_offset + b*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; -#if BIAS_TERM - result = fma(TO_ACCUMULATOR_TYPE(input[input_idx]), grad, result); -#else - uint input_grad_idx = INPUT0_OFFSET + grad_split_offset + b*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + j*INPUT0_X_PITCH + i*INPUT0_Y_PITCH; - result = fma(TO_ACCUMULATOR_TYPE(input[input_idx]), TO_ACCUMULATOR_TYPE(input_grad[input_grad_idx]), result); -#endif - } -#if BIAS_TERM - result_bias += grad; -#endif - } - } - - grad_w += result; - -#if BIAS_TERM - grad_b += result_bias; -#endif - } - -#if OUTPUT_GRAD_W - output[weights_idx] = grad_w; -#else - #if MOMENTUM - float update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; - #else - filter[weights_idx] -= lr * grad_w + DECAY_RATE * lr * filter[weights_idx]; - #endif - -#if BIAS_TERM - if(ifm == 0 && id_x == 0 && id_y == 0) - { -#if MOMENTUM - float update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= 
update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif - -#endif - -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl deleted file mode 100644 index fba71dbdd69..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_grad_weights_yxfb.cl +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - -__attribute__((intel_reqd_sub_group_size(16))) -KERNEL(convolution_grad_weights_gpu_ref)( - const __global UNIT_TYPE* input_grad, - __global UNIT_TYPE* output, - __global UNIT_TYPE* filter, -#if BIAS_TERM - __global UNIT_TYPE* bias, -#endif -#if MOMENTUM - __global UNIT_TYPE* prev_grad_w, -#if BIAS_TERM - __global UNIT_TYPE* prev_grad_b, -#endif -#endif - const __global UNIT_TYPE* input, - uint split_idx, - float lr) -{ - const uint local_id = get_local_id(0); - const uint ofm_ifm = get_global_id(1); - const uint id_x_y = get_global_id(2); - - const uint id_x = id_x_y % FILTER_SIZE_X; - const uint id_y = id_x_y / FILTER_SIZE_X; - const uint ifm = ofm_ifm % INPUT1_FEATURE_NUM; - const uint ofm = ofm_ifm / INPUT1_FEATURE_NUM; - - const int in_x = id_x - PADDING_SIZE_X; - const int in_y = id_y - PADDING_SIZE_Y; - - ACCUMULATOR_TYPE grad_w = 0; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - const uint grad_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_OFM_NUM; - const uint in_split_offset = split_idx * INPUT1_FEATURE_PITCH * FILTER_IFM_NUM; - - uint weights_idx = ofm * FILTER_OFM_PITCH + ifm * FILTER_IFM_PITCH + id_y * FILTER_Y_PITCH + id_x * FILTER_X_PITCH; - - for(int y = 0; y < INPUT0_SIZE_Y; y++) - { - const int input_offset_y = in_y + y * STRIDE_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT1_SIZE_Y || input_offset_y < 0; - for (uint x = 0; x < INPUT0_SIZE_X; x++) - { - const int input_offset_x = in_x + x * STRIDE_SIZE_X; - const bool zero_x = input_offset_x >= INPUT1_SIZE_X || input_offset_x < 0; - for (uint b = 0; b < INPUT0_BATCH_NUM / 16; b++) - { -#if BIAS_TERM - uint input_grad_idx = grad_split_offset + b*16*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + x*INPUT0_X_PITCH + y*INPUT0_Y_PITCH; - UNIT_TYPE grad = as_float(intel_sub_group_block_read((const __global uint*)(input_grad + input_grad_idx))); - grad_b += grad; -#endif - if(!zero_x && !zero_y) - { - uint input_idx = in_split_offset + b*16*INPUT1_BATCH_PITCH + ifm*INPUT1_FEATURE_PITCH + (uint)input_offset_x*INPUT1_X_PITCH + (uint)input_offset_y*INPUT1_Y_PITCH; -#if BIAS_TERM - grad_w = fma(as_float(intel_sub_group_block_read((const __global uint*)(input + input_idx))), grad, grad_w); -#else - uint input_grad_idx = grad_split_offset + b*16*INPUT0_BATCH_PITCH + ofm*INPUT0_FEATURE_PITCH + x*INPUT0_X_PITCH + y*INPUT0_Y_PITCH; - grad_w = 
fma(as_float(intel_sub_group_block_read((const __global uint*)(input + input_idx))), as_float(intel_sub_group_block_read((const __global uint*)(input_grad + input_grad_idx))), grad_w); -#endif - } - } - } - } - - grad_w = sub_group_reduce_add(grad_w); -#if BIAS_TERM - grad_b = sub_group_reduce_add(grad_b); -#endif - - if (local_id == 0) - { -#if OUTPUT_GRAD_W - output[weights_idx] = grad_w; -#else - #if MOMENTUM - UNIT_TYPE update_gradient_w = lr * (grad_w + DECAY_RATE * filter[weights_idx]) + prev_grad_w[weights_idx] * MOMENTUM_FACTOR; - filter[weights_idx] -= update_gradient_w; - prev_grad_w[weights_idx] = update_gradient_w; - #else - filter[weights_idx] -= lr * (grad_w + DECAY_RATE * filter[weights_idx]); - #endif - -#if BIAS_TERM - if(ifm == 0 && id_x == 0 && id_y == 0) - { -#if MOMENTUM - UNIT_TYPE update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif -#endif - } -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl index c36cb5a07ce..8691930df78 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_bfyx_opt.cl @@ -89,15 +89,6 @@ KERNEL(deconvolution_gpu_bfyx_opt)( uint fixed_input_offset_y = (uint)input_offset_y / STRIDE_SIZE_Y; uint input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + (uint)fixed_input_offset_y*INPUT0_Y_PITCH; -#if GRADIENT - uint filter_idx = filter_offset + of*FILTER_IFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; - for (uint h = 0; h < FILTER_OFM_NUM; h++) - { - acc += TO_ACCUMULATOR_TYPE(input[input_idx]) * TO_ACCUMULATOR_TYPE(filter[filter_idx]); - filter_idx += FILTER_OFM_PITCH; - input_idx += INPUT0_FEATURE_PITCH; - } -#else uint filter_idx = filter_offset + of*FILTER_OFM_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_IFM_NUM; h++) { @@ -105,7 +96,6 @@ KERNEL(deconvolution_gpu_bfyx_opt)( filter_idx += FILTER_IFM_PITCH; input_idx += INPUT0_FEATURE_PITCH; } -#endif } } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl index f7fd8ee408a..d87bbafd948 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/deconvolution_gpu_ref.cl @@ -105,24 +105,6 @@ KERNEL(deconvolution_gpu_yxfb_ref)( input_idx = input_offset + (uint)fixed_input_offset_x*INPUT0_X_PITCH + (uint)fixed_input_offset_y*INPUT0_Y_PITCH + (uint)fixed_input_offset_z*INPUT0_Z_PITCH; #endif -#if GRADIENT - uint filter_idx = filter_offset + of*FILTER_IFM_PITCH + (FILTER_SIZE_Z - k - 1)*FILTER_Z_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; - for (uint h = 0; h < FILTER_OFM_NUM; h++) { -#if !INPUT0_SIMPLE -# if INPUT0_DIMS <= 4 - input_idx = INPUT0_GET_INDEX(batch_offset, h + g*FILTER_IFM_NUM, fixed_input_offset_y, fixed_input_offset_x); -# elif INPUT0_DIMS == 5 - input_idx = INPUT0_GET_INDEX(batch_offset, h + g*FILTER_IFM_NUM, 
fixed_input_offset_z, fixed_input_offset_y, fixed_input_offset_x); -# endif -#endif - - acc += TO_ACCUMULATOR_TYPE(input[input_idx]) * TO_ACCUMULATOR_TYPE(filter[filter_idx]); - filter_idx += FILTER_OFM_PITCH; -#if INPUT0_SIMPLE - input_idx += INPUT0_FEATURE_PITCH; -#endif - } -#else // GRADIENT uint filter_idx = filter_offset + of*FILTER_OFM_PITCH + (FILTER_SIZE_Z - k - 1)*FILTER_Z_PITCH + (FILTER_SIZE_Y - i - 1)*FILTER_Y_PITCH + (FILTER_SIZE_X - j - 1)*FILTER_X_PITCH; for (uint h = 0; h < FILTER_IFM_NUM; h++) { #if !INPUT0_SIMPLE @@ -139,7 +121,6 @@ KERNEL(deconvolution_gpu_yxfb_ref)( input_idx += INPUT0_FEATURE_PITCH; #endif } -#endif // GRADIENT } } } diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl deleted file mode 100644 index 00a06c524a8..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_b_fs_yx_fsv4.cl +++ /dev/null @@ -1,104 +0,0 @@ -/* -// Copyright (c) 2019 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "include/include_all.cl" - -#define PACK 4 - -#define SGR_MAX_SIZE (get_max_sub_group_size()) -#define SGR_LOCAL_ID (get_sub_group_local_id()) - -#define GET_INDEX(_x) \ - ( ((_x / SGR_MAX_SIZE) * SGR_MAX_SIZE /* Normed to max_subgroup_size */) \ - * (4 * sizeof(int) /* 4xINT32 per sub_group reading */) \ - ) - -inline int16 FUNC(get_int16)(const __global UNIT_TYPE* src, uint idx) -{ - int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(src + idx))); - int16 to_return; - for(uint i = 0; i < 4; i++) - { - for(uint j = 0; j < 4; j++) - { - to_return[i * 4 + j] = as_char4(int_data[i])[j]; - } - } - return to_return; -} -#define GET_INPUT(A, B) FUNC_CALL(get_int16)(A, GET_INDEX(x)) - - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(eltwise_b_fs_yx_fsv4)( - INPUTS_DECLS - __global UNIT_TYPE* output -#if CALIBRATION_TERM - , const __global float* calibrations -#endif - ) -{ - // This kernel works with linearized data w/o strides and padding - // so only one dimension 'X' is required - const uint x = get_global_id(0); - const uint idx = GET_INDEX(x); - - int16 res; - - DO_ELTWISE; - - for(uint i = 0; i < 4; i++) - { - const uint out_idx = idx + (sizeof(int) * (SGR_LOCAL_ID + (i * SGR_MAX_SIZE))); - char4 char_res; - - for(uint j = 0; j < 4; j++) - { - int res_tmp = res[i * 4 + j]; - #if QUANTIZATION_TERM - #if CALIBRATION_TERM - // Batch: - const uint b = out_idx / OUTPUT_BATCH_PITCH; - // Feature: - // Because of specific data layout Feature must be normed to PACK size - uint d3 = ((out_idx - b * OUTPUT_BATCH_PITCH) / (OUTPUT_FEATURE_PITCH * PACK)) * PACK; - res_tmp = (int)round(((float)res_tmp) * calibrations[d3+j]); - #else // CALIBRATION_TERM - res_tmp = (int)round(((float)res_tmp) * O_QF); - #endif // CALIBRATION_TERM - #endif // QUANTIZATION_TERM - - #if QUANTIZATION_TERM - #ifdef ELTW_UNSIGNED - char_res[j] = 
ACTIVATION(convert_uchar_sat(res_tmp), ACTIVATION_PARAMS); - #else - char_res[j] = ACTIVATION(convert_char_sat(res_tmp), ACTIVATION_PARAMS); - #endif - #else - char_res[j] = ACTIVATION(convert_char(res_tmp), ACTIVATION_PARAMS); - #endif - } - // put 4 chars into output - // char_result[i] = as_int(char_res); - *((__global int*)(output + out_idx)) = as_int(char_res); - } -} - -#undef PACK -#undef SGR_MAX_SIZE -#undef SGR_LOCAL_ID -#undef GET_INDEX -#undef GET_INPUT diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index f93b1ab5aa7..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/eltwise_fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,83 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "include/include_all.cl" - -#ifdef INPUT_STRIDED -#define GET_INDEX(src) \ - GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2 * CAT(src, _STRIDE_Y), d1 * CAT(src, _STRIDE_X)) -#else -#define GET_INDEX(src) \ - GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(src, d4, d3, d2, d1) -#endif - -int16 FUNC(get_int16)(const __global UNIT_TYPE* src, uint idx) -{ - int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(src + idx))); - int16 to_return; - for(uint b = 0; b < 4; b++) - { - for(uint f = 0; f < 4; f++) - { - to_return[b * 4 + f] = as_char4(int_data[b])[f]; - } - } - return to_return; -} -#define GET_INPUT(A, B) FUNC_CALL(get_int16)(A, GET_INDEX(B)) - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(eltwise_fs_bs_yx_bsv4_fsv32)( - INPUTS_DECLS - __global UNIT_TYPE* output -#if CALIBRATION_TERM - , const __global float* calibrations -#endif - ) -{ - const uint of_32_aligned = ((OUTPUT_FEATURE_NUM + 31) / 32) * 32; - const uint d1 = get_global_id(0); // X - const uint d2 = get_global_id(1); // Y - const uint d3 = ((uint)get_global_id(2) * 4) % of_32_aligned; // Feature - const uint d4 = 4 * (((uint)get_global_id(2) * 4) / of_32_aligned); // Batch - - int16 res; - - DO_ELTWISE; - - int4 char_result; - for(uint b = 0; b < 4; b++) - { - char4 char_res; - for(uint f = 0; f < 4; f++) - { - int res_tmp = res[b * 4 + f]; - #if CALIBRATION_TERM - res_tmp = (int)round(((float)res_tmp) * calibrations[d3+f]); - #else // CALIBRATION_TERM - res_tmp = (int)round(((float)res_tmp) * O_QF); - #endif // CALIBRATION_TERM - char_res[f] = ACTIVATION(convert_char_sat(res_tmp), ACTIVATION_PARAMS); - } - // pack 4 chars into int - char_result[b] = as_int(char_res); - } - - uint output_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, d4, d3, d2, d1); - intel_sub_group_block_write4((__global uint*)(output + output_offset), as_uint4(char_result)); -} - -#undef GET_INDEX diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl 
b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl deleted file mode 100644 index f1a5a4e5f9f..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/embed_ref.cl +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - -KERNEL(embed_ref)(const __global UNIT_TYPE* input0, - __global UNIT_TYPE* output, - const __global UNIT_TYPE* weights -#if BIAS_TERM - ,const __global UNIT_TYPE* biases -#endif -) -{ - const uint x = (uint)get_global_id(0); - const uint y = (uint)get_global_id(1); - const uint b = (uint)get_global_id(2); - - uint output_idx = (b*INPUT0_ELEMENTS_COUNT*NUM_OUTPUT_SIZE)+(uint)(x*NUM_OUTPUT_SIZE+y); - output[output_idx] = weights[(uint)(input0[(b*INPUT0_ELEMENTS_COUNT)+x]*NUM_OUTPUT_SIZE+y)]; -#if BIAS_TERM - output[output_idx] += biases[y]; -#endif -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl deleted file mode 100644 index 76169fe87eb..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_gpu_mmad_batched.cl +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/common.cl" - -#include "include/data_types.cl" -#include "include/fetch.cl" -#include "include/mmad.cl" - -#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32) -#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8) -#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32) -#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8) - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(fully_connected_kernel_mmad_batched)( - const __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - const __global FILTER_TYPE* weights -#if BIAS_TERM - , const __global BIAS_TYPE* biases -#endif -#if QUANTIZATION_TERM - ,const __global float* quantizations -#endif -#if CALIBRATION_TERM - ,const __global float* calibrations -#endif - ) -{ - const uint sg_channel = get_sub_group_local_id(); - - const uint batch_id = (uint)get_group_id(0) * 8; - const uint b_block = batch_id / 4; - const uint f = (uint)get_global_id(1) % FILTER_OFM_ALIGNED; - - uint in_addr = IN_OFFSET + b_block * IN_B_BLOCK_PITCH; - - const uint filter_offset = ((uint)get_group_id(1) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH; - uint filter_idx = filter_offset; - - int8 tileA; - int8 tileB; - int8 tileC = 0; - - for(uint z = 0; z < FILTER_IFM_MMAD_NUM; z++ ) - { - for (uint k = 0; k < FILTER_SIZE_X * FILTER_SIZE_Y; ++k) - { - // load A tile ( input ) - // load 8 batches 4 channels per WI, so we'll have 8x32 block - - tileA.lo = as_int4(intel_sub_group_block_read4((const __global uint*)(input + in_addr))); - tileA.hi = as_int4(intel_sub_group_block_read4((const __global uint*)(input + in_addr + IN_B_BLOCK_PITCH))); - - // load B tile ( weights ) - tileB = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx))); - - // compute C tile ( output ) - tileC = MMAD_8x8(tileA, tileB, tileC); // here we output 8 batches per workitem, and each workitem gets different output feature - - in_addr += 32 * 4; // 4 batches * 4 features per channel * 8 SIMD channels - filter_idx += 32*8; // 32 features per channel * 8 output features per SIMD channel - } - in_addr += IN_F_BLOCK_PITCH; - in_addr -= (FILTER_SIZE_X * FILTER_SIZE_Y * 32 * 4); - } - -#if BIAS_TERM -#if BIAS_PER_OUTPUT - const uint bias_index = GET_DATA_INDEX(BIAS, batch_id, f, y, x); -#elif BIAS_PER_OFM - const uint bias_index = f; -#endif - for(uint i = 0; i < 8; i++) - { -#if CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]); -#else // CALIBRATION_TERM - tileC[i] = (UNIT_TYPE)round(((float)tileC[i] * quantizations[f] * I_QF + biases[bias_index]) * O_QF); -#endif // CALIBRATION_TERM - } -#endif // BIAS_TERM - - // save to output - if(f < FILTER_OFM_NUM) - { - for(uint i = 0; i < 8; i++) - { - const uint curr_b = batch_id + i; -#if defined OUTPUT_LAYOUT_FS_BS_YX_BSV4_FSV32 - const uint dst_index = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, curr_b, f, 0, 0); -#else - const uint dst_index = GET_DATA_INDEX(OUTPUT, curr_b, f, 0, 0); -#endif - output[dst_index] = ACTIVATION(convert_char(tileC[i]), ACTIVATION_PARAMS); - } - } -} - -#undef FILTER_IFM_MMAD_NUM -#undef FILTER_OFM_MMAD_NUM -#undef FILTER_IFM_ALIGNED -#undef FILTER_OFM_ALIGNED diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl deleted file mode 100644 index eb1d803cbef..00000000000 --- 
a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_input_gpu_ref.cl +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/include_all.cl" - -KERNEL(fully_connected_grad_input_gpu_ref)( - const __global INPUT0_TYPE* input_grad, - __global OUTPUT_TYPE* output, - const __global FILTER_TYPE* weights, - const __global INPUT1_TYPE* input - ) -{ - const uint x = get_global_id(1); - const uint y = get_global_id(2); - const uint b_f = get_global_id(0); - const uint batch_id = b_f % INPUT0_BATCH_NUM; - const uint feature_id = b_f / INPUT0_BATCH_NUM; - - if(b_f >= INPUT1_FEATURE_NUM * INPUT0_BATCH_NUM) - return; - - ACCUMULATOR_TYPE result = 0; - - for (uint ofm = 0; ofm < FILTER_OFM_NUM; ++ofm) - { - const uint input_grad_idx = GET_DATA_INDEX(INPUT0, batch_id, 0, 0, ofm); - const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, feature_id, y, x); - - result += (ACCUMULATOR_TYPE)(input_grad[input_grad_idx] * weights[filter_idx]); - } - - const uint output_idx = GET_DATA_INDEX(OUTPUT, batch_id, feature_id, y, x); - output[output_idx] = result; -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl deleted file mode 100644 index c038bdf3e10..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fully_connected_grad_weights_gpu_ref.cl +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2016-2017 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- - -#include "include/include_all.cl" - -KERNEL(fully_connected_grad_weights_gpu_ref)( - const __global INPUT0_TYPE* input_grad, - __global OUTPUT_TYPE* output, - __global float* weights, -#if BIAS_TERM - __global float* bias, -#endif -#if MOMENTUM - __global float* prev_grad_w, -#if BIAS_TERM - __global float* prev_grad_b, -#endif -#endif - const __global INPUT1_TYPE* input, - const float lr - ) -{ - const uint ofm_ifm = get_global_id(0); - const uint id_x = (uint)get_global_id(1); - const uint id_y = (uint)get_global_id(2); - const uint ifm = ofm_ifm % FILTER_IFM_NUM; - const uint ofm = ofm_ifm / FILTER_IFM_NUM; - - ACCUMULATOR_TYPE grad_w = 0; -#if BIAS_TERM - ACCUMULATOR_TYPE grad_b = 0; -#endif - - const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, ofm, ifm, id_y, id_x); - for (uint b = 0; b < INPUT0_BATCH_NUM; b++) - { - const uint input_grad_idx = GET_DATA_INDEX(INPUT0, b, 0, 0, ofm); - const uint input_idx = GET_DATA_INDEX(INPUT1, b, ifm, id_y, id_x); - ACCUMULATOR_TYPE grad = TO_ACCUMULATOR_TYPE(input_grad[input_grad_idx]); - grad_w += TO_ACCUMULATOR_TYPE(input[input_idx] * grad); -#if BIAS_TERM - grad_b += TO_ACCUMULATOR_TYPE(grad); -#endif - } - -#if MOMENTUM - float update_gradient_w = lr * (grad_w + DECAY_RATE * weights[filter_idx]) + prev_grad_w[filter_idx] * MOMENTUM_FACTOR; - weights[filter_idx] -= update_gradient_w; - prev_grad_w[filter_idx] = update_gradient_w; -#else - weights[filter_idx] -= lr * grad_w + DECAY_RATE * lr * weights[filter_idx]; -#endif - -#if BIAS_TERM - if(ifm == 0 && id_x == 0 && id_y == 0) - { -#if MOMENTUM - float update_gradient_b = lr * grad_b + prev_grad_b[ofm] * MOMENTUM_FACTOR; - bias[ofm] -= update_gradient_b; - prev_grad_b[ofm] = update_gradient_b; -#else - bias[ofm] -= lr * grad_b; -#endif - } -#endif - - -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl deleted file mode 100644 index e27ff51a405..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_bn_scale_kernel_ref.cl +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/include_all.cl" - -#define LOCAL_SIZE INPUT0_BATCH_NUM - -__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1))) -KERNEL(convolution)( - __global INPUT0_TYPE* input, - __global OUTPUT_TYPE* output, - __global FILTER_TYPE* weights, -#if BIAS_TERM - __global BIAS_TYPE* biases, -#endif - uint split_idx, - __global INPUT0_TYPE* scale_in -#if SCALE_BIAS_TERM - , __global INPUT0_TYPE* scale_bias -#endif -#if FUSED_TRAINING - , __global INPUT0_TYPE* inv_var, - __global INPUT0_TYPE* conv_output, - __global INPUT0_TYPE* bn_output -#endif - ) -{ - const uint f = get_global_id(1); - const uint b = get_global_id(0); - - UNIT_TYPE conv_out = UNIT_VAL_ZERO; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM; - - const uint filter_offset = f*FILTER_OFM_PITCH; - const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset; - - for (uint y = 0; y < OUTPUT_SIZE_Y; ++y) - { - const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y; - for (uint x = 0; x < OUTPUT_SIZE_X; ++x) - { - const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X; - for (uint k = 0; k < FILTER_IFM_NUM; ++k) - { - for (uint j = 0; j < FILTER_SIZE_Y ; ++j) - { - const int input_offset_y = input_y + j * DILATION_SIZE_Y; - const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0; - - if(!zero_y) - { - for (uint i = 0; i < FILTER_SIZE_X ; ++i) - { - const int input_offset_x = input_x + i * DILATION_SIZE_X; - const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0; - - if(!zero_x) - { - uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*INPUT0_FEATURE_PITCH; - uint filter_idx = filter_offset + k*FILTER_IFM_PITCH + j*FILTER_Y_PITCH + i*FILTER_X_PITCH; - conv_out += input[input_idx] * weights[filter_idx]; - } - } - } - } - } -#if BIAS_TERM - conv_out += (UNIT_TYPE)biases[f]; -#endif - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset; -#ifdef FUSED_TRAINING - conv_output[dst_index] = conv_out; -#else - output[dst_index] = conv_out; -#endif - } - } - - - // BATCH NORM PART - barrier(CLK_LOCAL_MEM_FENCE); - - __local ACCUMULATOR_TYPE sum[LOCAL_SIZE]; - - const uint local_idx = b; - - sum[local_idx] = 0; - - uint input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { -#ifdef FUSED_TRAINING - UNIT_TYPE in = conv_output[input_idx]; -#else - UNIT_TYPE in = output[input_idx]; -#endif - sum[local_idx] += in; - input_idx += OUTPUT_X_PITCH; - } - input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - UNIT_TYPE mean = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - - sum[local_idx] = 0; - - input_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { -#ifdef FUSED_TRAINING - UNIT_TYPE in = conv_output[input_idx] - mean; -#else - UNIT_TYPE in = output[input_idx] - mean; -#endif - sum[local_idx] += in * in; - input_idx += OUTPUT_X_PITCH; - } - input_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for(uint 
offset = LOCAL_SIZE / 2; offset > 0; offset /= 2) - { - if (local_idx < offset) - { - sum[local_idx] += sum[local_idx + offset]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - float variance = sum[0] / (OUTPUT_BATCH_NUM * OUTPUT_SIZE_X * OUTPUT_SIZE_Y); - - float inv_variance = (float)(1.0 / sqrt(variance + EPSILON)); - -#ifdef FUSED_TRAINING - if (local_idx == 0) - inv_var[f] = inv_variance; -#endif - - uint out_idx = GET_DATA_INDEX(OUTPUT, local_idx, f, 0, 0); - for (uint y = 0; y < OUTPUT_SIZE_Y; y++) - { - for (uint x = 0; x < OUTPUT_SIZE_X; x++) - { -#ifdef FUSED_TRAINING - UNIT_TYPE out_val = inv_variance * (conv_output[out_idx] - mean); - bn_output[out_idx] = out_val; -#ifdef SCALE_BIAS_TERM - output[out_idx] = ACTIVATION(out_val * scale_in[f] + scale_bias[f], ACTIVATION_PARAMS); -#else - output[out_idx] = ACTIVATION(out_val * scale_in[f], ACTIVATION_PARAMS); -#endif -#else -#ifdef SCALE_BIAS_TERM - output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f] + scale_bias[f], ACTIVATION_PARAMS); -#else - output[out_idx] = ACTIVATION(inv_variance * (output[out_idx] - mean) * scale_in[f], ACTIVATION_PARAMS); -#endif -#endif - out_idx += OUTPUT_X_PITCH; - } - out_idx += OUTPUT_Y_PITCH - OUTPUT_SIZE_X * OUTPUT_X_PITCH; - } - -} - -#undef LOCAL_SIZE \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl deleted file mode 100644 index e7af9776425..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_gemm_fp32.cl +++ /dev/null @@ -1,602 +0,0 @@ -/* -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "include/include_all.cl" -#include "include/sub_group.cl" -#include "include/fetch.cl" - -#define TILE_M 2 -#define TILE_K FILTER_SIZE_X -#define TILE_N 32 - -inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset)(uint out_offset, uint strideX, uint strideY) -{ -// bfyx - uint tmp_idx = out_offset; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - x_idx *= strideX; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - y_idx *= strideY; - tmp_idx /= OUTPUT_SIZE_Y; - uint f_idx = tmp_idx % OUTPUT_FEATURE_NUM; - tmp_idx /= OUTPUT_FEATURE_NUM; - uint b_idx = tmp_idx % OUTPUT_BATCH_NUM; - - return GET_DATA_INDEX(INPUT1, b_idx, f_idx, y_idx, x_idx); -} - -__attribute__((intel_reqd_sub_group_size(8))) -KERNEL(fused_conv_eltwise_gemm_fp32)( - const __global float *src0, - __global float *dst, - const __global float *src1, -#if BIAS_TERM - const __global float *bias, -#endif - uint split_idx, - const __global float* src3) -{ -#include "include/vec_typedefs.cl" - - const unsigned group_x = get_group_id(0); - const unsigned group_y = get_group_id(1); - const unsigned global_x = get_global_id(0); - const unsigned global_y = get_global_id(1); - const unsigned global_z = get_global_id(2); - - unsigned interleaved_y; - unsigned kernel_y; - unsigned kernel_idx; - - // Result ctile (*dst) is M rows x N columns - // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile. - float8 blockC00 = 0.f; - float8 blockC10 = 0.f; - float8 blockC20 = 0.f; - float8 blockC30 = 0.f; - float8 blockC01 = 0.f; - float8 blockC11 = 0.f; - float8 blockC21 = 0.f; - float8 blockC31 = 0.f; - - const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * INPUT0_FEATURE_NUM; - // Src0 (patch input) is directly used as atile. - // Each work item points to the start of a different patch. - // atile is M rows x K columns. - const uint src0_read_offset0_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset - + INPUT0_BATCH_PITCH * global_z // batch offset - + ( ( ( global_y * TILE_M + 0 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset - + ( ( ( global_y * TILE_M + 0 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset - const uint src0_read_offset1_const = INPUT0_OFFSET_WITH_PADDING + in_split_offset - + INPUT0_BATCH_PITCH * global_z // batch offset - + ( ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * STRIDE_SIZE_Y * INPUT0_Y_PITCH ) // y offset - + ( ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ) * STRIDE_SIZE_X ); // x offset - - // Src1 (filter) is directly used as btile. - // It starts at the top of src1 and walks down. - // btile is K rows x N columns. - uint src0_read_offset0 = src0_read_offset0_const; - uint src0_read_offset1 = src0_read_offset1_const; - uint src1_read_offset = ( global_x * TILE_N * 2); - -#define DOT_PRODUCT_8( _result, _rowA, colB ) \ - { \ - _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \ - _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \ - _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \ - _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \ - _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \ - _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \ - _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \ - _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \ - } - - // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1. 
- // Inner loop loads and FMADs one row (FILTER_SIZE_X) of each input patch - // and FILTER_SIZE_X/2 rows of interleaved filter. - unsigned patch_depth = 0; - do - { - unsigned patch_row = 0; - do - { - // Load atile and btile. - // Kernel data is partially interleaved. Every 2 rows are interleaved at float8 granularity. - // The exception is that if FILTER_SIZE_X is odd the last row is not interleaved. The non - // interleaved row is padded with zero to ensure same size as interleaved rows. This - // interleaving is done to ensure 0% GDR bank conflicts. For example, this is how the - // kernel data would be arranged before/after interleaving for FILTER_SIZE_X=3. - // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) .. - // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ... - // (0, 2) (8, 2) (16, 2) (24, 2) ... ... - // ... - const bool kernel_width_is_odd = FILTER_SIZE_X % 2 == 1; - - float blockA00[FILTER_SIZE_X]; - float blockA01[FILTER_SIZE_X]; - - // in case the data is not aligned to sizeof(T)*FILTER_SIZE_X we need to use vload or set the data in a loop - { - unsigned i = 0; - LOOP(FILTER_SIZE_X, i, - { -#if LEFTOVERS == 1 - if(src0_read_offset0_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - { - if(src0_read_offset0 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - blockA00[i] = src0[src0_read_offset0 + i]; - } - else -#endif - blockA00[i] = src0[src0_read_offset0 + i]; - -#if LEFTOVERS == 1 - if(src0_read_offset1_const + (FILTER_SIZE_Y - 1) * INPUT0_Y_PITCH + (INPUT0_FEATURE_NUM - 1) * (INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH )) >= INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - { - if(src0_read_offset1 + i < INPUT0_BATCH_NUM * INPUT0_BATCH_PITCH) - blockA01[i] = src0[src0_read_offset1 + i]; - } - else -#endif - blockA01[i] = src0[src0_read_offset1 + i]; - } ) - } - - float* pblockA00 = (float*)(&blockA00); - float* pblockA01 = (float*)(&blockA01); - - src0_read_offset0 += INPUT0_Y_PITCH; - src0_read_offset1 += INPUT0_Y_PITCH; - - - float blockB00[FILTER_SIZE_X*4]; - float8* p8BlockB00 = (float8*)blockB00; - float4* p4BlockB00 = (float4*)blockB00; - float* pBlockB00 = (float* )blockB00; - - interleaved_y = 0; - LOOP(FILTER_SIZE_X_DIV2, interleaved_y, - { - p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) ); - src1_read_offset += ALIGNED_OFM * 2; - } ) - if ( kernel_width_is_odd ) - { - p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) ); - src1_read_offset += ALIGNED_OFM * 2; - } - - // Perform MADs - kernel_idx = 0; - interleaved_y = 0; - LOOP(FILTER_SIZE_X_DIV2, interleaved_y, - { - kernel_y = interleaved_y * 2; - DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); 
kernel_idx++; - DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++; - } ) - if ( kernel_width_is_odd ) - { - kernel_y = interleaved_y * 2; - DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); - DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++; - } - } - - //while( ++patch_row < 1 ); //debug - while( ++patch_row < FILTER_SIZE_Y ); - - src0_read_offset0 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch - src0_read_offset1 += INPUT0_FEATURE_PITCH - ( FILTER_SIZE_Y * INPUT0_Y_PITCH ); // reset to start of next slice of patch - } - //while ( ++patch_depth < 1 ); //debug - while ( ++patch_depth < INPUT0_FEATURE_NUM ); - - const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM; - // Dst resembles a cube of width x height x (output channel * batches). Each tile writes: - // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used. 
- __global float *out0 = dst + OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset - __global float *out1 = dst + OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); // x offset - - #if BIAS_TERM - __global float8* biasPtr = (__global float8*) (bias + group_x * TILE_N); - #endif - - uint out0_offset = OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M ) % OUTPUT_SIZE_X ); // x offset - - uint out1_offset = OUTPUT_OFFSET + out_split_offset - + global_z * OUTPUT_BATCH_PITCH // batch offset - + ( group_x * TILE_N ) * OUTPUT_FEATURE_PITCH // channel offset - + ( ( global_y * TILE_M + 1 ) / OUTPUT_SIZE_X ) * OUTPUT_Y_PITCH // y offset - + ( ( global_y * TILE_M + 1 ) % OUTPUT_SIZE_X ); - - //-----------------------------------------------------------------------------------------------// - // OUTPUT PHASE - //-----------------------------------------------------------------------------------------------// - if( global_y * TILE_M < OUTPUT_SIZE_X * OUTPUT_SIZE_Y ) - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - blockC20 += *(biasPtr + 2); - blockC30 += *(biasPtr + 3); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - blockC20 = ACTIVATION_CONV(blockC20, ACTIVATION_PARAMS_CONV); - blockC30 = ACTIVATION_CONV(blockC30, ACTIVATION_PARAMS_CONV); - - // eltwise - uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; - } - - blockC00 = ACTIVATION_ELTW(blockC00, ACTIVATION_PARAMS_ELTW); - blockC10 = ACTIVATION_ELTW(blockC10, ACTIVATION_PARAMS_ELTW); - blockC20 = ACTIVATION_ELTW(blockC20, ACTIVATION_PARAMS_ELTW); - blockC30 = ACTIVATION_ELTW(blockC30, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for( unsigned i = 0; i < 8; i++ ) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; - out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; - } - } - else - { - if ( ( global_x + 1 ) < get_global_size(0) ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - blockC20 += *(biasPtr + 2); - blockC30 += *(biasPtr + 3); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - blockC20 = ACTIVATION_CONV(blockC20, ACTIVATION_PARAMS_CONV); - blockC30 = ACTIVATION_CONV(blockC30, ACTIVATION_PARAMS_CONV); - - // eltwise - uint 
src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - blockC30[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; - } - - blockC00 = ACTIVATION_ELTW(blockC00, ACTIVATION_PARAMS_ELTW); - blockC10 = ACTIVATION_ELTW(blockC10, ACTIVATION_PARAMS_ELTW); - blockC20 = ACTIVATION_ELTW(blockC20, ACTIVATION_PARAMS_ELTW); - blockC30 = ACTIVATION_ELTW(blockC30, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for ( unsigned i = 0; i < 8; i++ ) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; - out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; - } - } - else - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - blockC20 += *(biasPtr + 2); - if (( OUTPUT_FEATURE_NUM % TILE_N) > 24 ) blockC30 += *(biasPtr + 3); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - blockC20 = ACTIVATION_CONV(blockC20, ACTIVATION_PARAMS_CONV); - - // remaining output channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - blockC30[i] = ACTIVATION_CONV(blockC30[i], ACTIVATION_PARAMS_CONV); - } - - // eltwise - uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out0_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC00[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC10[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC20[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - } - - // remaining output channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - blockC30[i] += src3[src3_offset + (i + 24 )* INPUT1_FEATURE_PITCH]; - blockC30[i] = ACTIVATION_ELTW(blockC30[i], ACTIVATION_PARAMS_ELTW); - } - - blockC00 = ACTIVATION_ELTW(blockC00, ACTIVATION_PARAMS_ELTW); - blockC10 = ACTIVATION_ELTW(blockC10, ACTIVATION_PARAMS_ELTW); - blockC20 = ACTIVATION_ELTW(blockC20, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for (unsigned i = 0; i < 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - out0[(16+i) * OUTPUT_FEATURE_PITCH] = blockC20[i]; - } - - // remaining output channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[(24+i) * OUTPUT_FEATURE_PITCH] = blockC30[i]; - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - blockC10 += *(biasPtr + 1); - if (( OUTPUT_FEATURE_NUM % TILE_N) > 16 ) - blockC20 += *(biasPtr + 2); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - blockC10 = ACTIVATION_CONV(blockC10, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - out0[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC10[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC20[i], ACTIVATION_PARAMS_CONV); - - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 ) - { - #if BIAS_TERM - blockC00 += *biasPtr; - if (( OUTPUT_FEATURE_NUM % TILE_N) 
> 8 ) - blockC10 += *(biasPtr + 1); - #endif - - blockC00 = ACTIVATION_CONV(blockC00, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC00[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC10[i], ACTIVATION_PARAMS_CONV); - } - } - else - { - #if BIAS_TERM - blockC00 += *biasPtr; - #endif - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out0[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC00[i], ACTIVATION_PARAMS_CONV); - } - } - } - } - } - - if ((global_y * TILE_M + 1) < OUTPUT_SIZE_X * OUTPUT_SIZE_Y ) - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) == 0 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - blockC21 += *(biasPtr + 2); - blockC31 += *(biasPtr + 3); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - blockC21 = ACTIVATION_CONV(blockC21, ACTIVATION_PARAMS_CONV); - blockC31 = ACTIVATION_CONV(blockC31, ACTIVATION_PARAMS_CONV); - - // eltwise - uint src3_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset)(out1_offset, ELTW_STRIDE_X,ELTW_STRIDE_Y); - for(uint i = 0; i < 8; i++) - { - blockC01[i] += src3[src3_offset + (i + 0 )* INPUT1_FEATURE_PITCH]; - blockC11[i] += src3[src3_offset + (i + 8 )* INPUT1_FEATURE_PITCH]; - blockC21[i] += src3[src3_offset + (i + 16)* INPUT1_FEATURE_PITCH]; - blockC31[i] += src3[src3_offset + (i + 24)* INPUT1_FEATURE_PITCH]; - } - - blockC01 = ACTIVATION_ELTW(blockC01, ACTIVATION_PARAMS_ELTW); - blockC11 = ACTIVATION_ELTW(blockC11, ACTIVATION_PARAMS_ELTW); - blockC21 = ACTIVATION_ELTW(blockC21, ACTIVATION_PARAMS_ELTW); - blockC31 = ACTIVATION_ELTW(blockC31, ACTIVATION_PARAMS_ELTW); - // end eltwise - - for( unsigned i = 0; i < 8; i++ ) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; - out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i]; - } - } - else - { - if ( ( global_x + 1 ) < get_global_size(0) ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - blockC21 += *(biasPtr + 2); - blockC31 += *(biasPtr + 3); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - blockC21 = ACTIVATION_CONV(blockC21, ACTIVATION_PARAMS_CONV); - blockC31 = ACTIVATION_CONV(blockC31, ACTIVATION_PARAMS_CONV); - - for ( unsigned i = 0; i < 8; i++ ) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; - out1[(24+i) * OUTPUT_FEATURE_PITCH] = blockC31[i]; - } - } - else - { - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 24 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - blockC21 += *(biasPtr + 2); - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 24 ) blockC31 += *(biasPtr + 3); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - blockC21 = ACTIVATION_CONV(blockC21, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - out1[(16+i) * OUTPUT_FEATURE_PITCH] = blockC21[i]; - } - - // Remaining channels - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 
8; i++) - { - out1[(24+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC31[i], ACTIVATION_PARAMS_CONV); - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 16 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - blockC11 += *(biasPtr + 1); - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 16 ) blockC21 += *(biasPtr + 2); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - blockC11 = ACTIVATION_CONV(blockC11, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - out1[( 8+i) * OUTPUT_FEATURE_PITCH] = blockC11[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out1[(16+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC21[i], ACTIVATION_PARAMS_CONV); - } - } - else if ( ( OUTPUT_FEATURE_NUM % TILE_N ) >= 8 ) - { - #if BIAS_TERM - blockC01 += *biasPtr; - if ( ( OUTPUT_FEATURE_NUM % TILE_N ) > 8 ) blockC11 += *(biasPtr + 1); - #endif - - blockC01 = ACTIVATION_CONV(blockC01, ACTIVATION_PARAMS_CONV); - - for (unsigned i = 0; i < 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = blockC01[i]; - } - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out1[(8+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC11[i], ACTIVATION_PARAMS_CONV); - } - } - else - { - #if BIAS_TERM - blockC01 += *biasPtr; - #endif - - for (unsigned i = 0; i < OUTPUT_FEATURE_NUM % 8; i++) - { - out1[( 0+i) * OUTPUT_FEATURE_PITCH] = ACTIVATION_CONV(blockC01[i], ACTIVATION_PARAMS_CONV); - } - } - } - } - } -} diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl deleted file mode 100644 index e0eec62783e..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_128x128wg_slm_int8.cl +++ /dev/null @@ -1,510 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "include/mmad.cl" - -#define SUM_SCALE 0.11f -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - float4 tmp;\ - for(uint z = 0; z < 4; z++)\ - {\ - tmp.s0 = (float)eltw_input_vals[z * 4 + 0] * SUM_SCALE + bias_f.s0;\ - tmp.s1 = (float)eltw_input_vals[z * 4 + 1] * SUM_SCALE + bias_f.s1;\ - tmp.s2 = (float)eltw_input_vals[z * 4 + 2] * SUM_SCALE + bias_f.s2;\ - tmp.s3 = (float)eltw_input_vals[z * 4 + 3] * SUM_SCALE + bias_f.s3;\ - \ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + tmp.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + tmp.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + tmp.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + tmp.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = regC[0 * 4 + i][idx];\ - regC_uchar16.s1 = regC[1 * 4 + i][idx];\ - regC_uchar16.s2 = regC[2 * 4 + i][idx];\ - regC_uchar16.s3 = regC[3 * 4 + i][idx];\ - \ - regC_uchar16.s4 = regC[0 * 4 + i][idx+1];\ - regC_uchar16.s5 = regC[1 * 4 + i][idx+1];\ - regC_uchar16.s6 = regC[2 * 4 + i][idx+1];\ - regC_uchar16.s7 = regC[3 * 4 + i][idx+1];\ - \ - regC_uchar16.s8 = regC[0 * 4 + i][idx+2];\ - regC_uchar16.s9 = regC[1 * 4 + i][idx+2];\ - regC_uchar16.sa = regC[2 * 4 + i][idx+2];\ - regC_uchar16.sb = regC[3 * 4 + i][idx+2];\ - \ - regC_uchar16.sc = regC[0 * 4 + i][idx+3];\ - regC_uchar16.sd = regC[1 * 4 + i][idx+3];\ - regC_uchar16.se = regC[2 * 4 + i][idx+3];\ - regC_uchar16.sf = regC[3 * 4 + i][idx+3];\ - {\ - int16 sum;\ - for(uint s = 0; s <16; s++)\ - {\ - sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ - }\ - regC_uchar16.s0 = convert_uchar_sat( sum.s0 );\ - regC_uchar16.s1 = convert_uchar_sat( sum.s1 );\ - regC_uchar16.s2 = convert_uchar_sat( sum.s2 );\ - regC_uchar16.s3 = convert_uchar_sat( sum.s3 );\ - \ - regC_uchar16.s4 = convert_uchar_sat( sum.s4 );\ - regC_uchar16.s5 = convert_uchar_sat( sum.s5 );\ - regC_uchar16.s6 = convert_uchar_sat( sum.s6 );\ - regC_uchar16.s7 = convert_uchar_sat( sum.s7 );\ - \ - regC_uchar16.s8 = convert_uchar_sat( sum.s8 );\ - regC_uchar16.s9 = convert_uchar_sat( sum.s9 );\ - regC_uchar16.sa = convert_uchar_sat( sum.sa );\ - regC_uchar16.sb = convert_uchar_sat( sum.sb );\ - \ - regC_uchar16.sc = convert_uchar_sat( sum.sc );\ - regC_uchar16.sd = convert_uchar_sat( sum.sd );\ - regC_uchar16.se = convert_uchar_sat( sum.se );\ - regC_uchar16.sf = convert_uchar_sat( sum.sf );\ - } - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s1 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s2 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s3 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\ - regC_uchar16.s5 = 
as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.s6 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.s7 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\
-    \
-    regC_uchar16.s8 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.s9 = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sa = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sb = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\
-    \
-    regC_uchar16.sc = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sd = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.se = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS_CONV));\
-    regC_uchar16.sf = as_uchar(ACTIVATION_CONV( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS_CONV));\
-    {\
-        int16 sum;\
-        for(uint s = 0; s < 16; s++)\
-        {\
-            sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\
-        }\
-        regC_uchar16.s0 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s0) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s1 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s1) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s2 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s2) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s3 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s3) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\
-        \
-        regC_uchar16.s4 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s4) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s5 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s5) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s6 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s6) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s7 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s7) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\
-        \
-        regC_uchar16.s8 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s8) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.s9 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s9) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.sa = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sa) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\
-        regC_uchar16.sb = 
as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sb) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sc) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sd = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sd) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.se = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.se) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sf = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sf) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - } -#endif - - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -#if IN_OUT_OPT != 1 -inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset_account_padding)(uint cOffset, uint strideX, uint strideY) -{ -#if ELTW_WITH_PADDING == 1 || ELTW_STRIDE_X != 1 || ELTW_STRIDE_Y != 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - x_idx *= strideX; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - y_idx *= strideY; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * IN2_F_BLOCK_PITCH; - padded_offset += b_slice_idx * IN2_B_BLOCK_PITCH; - padded_offset += y_idx * IN2_Y_PITCH; - padded_offset += x_idx * IN2_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += IN2_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} -#endif - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - 
__attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] ); - } - colB[1] = l_tileB[l_offsetTileB_col3]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]); - } - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]); - } -} - -/* - * \brief GEMM kernel to compute MxN matrix using SLM - * \param g_inA - Input matrix - * \param g_inB - Input matrix - * \param g_outC - Output matrix - */ - -__attribute__((intel_reqd_sub_group_size(SG_SIZE))) -KERNEL(Kernel_GEMM_MMAD8_32x32SG_128x128WG_SLM_INT8_fused_eltwise) - ( - __global char* const g_inA, - __global int* g_outC, - __global char* const g_inB, - #if BIAS_TERM - __global BIAS_TYPE* biases, - #endif - __global float* quantizations, - #if CALIBRATION_TERM - __global float* calibrations, - #endif - uint split_idx, - __global char* const input2, - __global float* eltw_calibrations - ) -{ - - __global int4* const g_matrixA = (__global int4*)g_inA; - __global int4* const g_matrixB = (__global int4*)g_inB; - __global int8* g_matrixC = (__global int8*)g_outC; - - // Each work-group works to compute 128x128 tile. - // Each work-group contains 16 sub-groups. - // Each sub-group within the work-group works to compute a 32x32 tile. - // 1) All work-items in WG fill SLM with tileA (128x32) and tileB (32x128). - // 2) Each sub-group works to compute 32x32 tileC (stored in regC). - // Note that each work-item in the sub-group computes a 32x4 chunk of tileC. 
-    // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
-    __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024
-    __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)]; // [2*128*32/8] = 1024
-
-    __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA;
-    __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA;
-    __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB;
-
-    const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y);
-
-    const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint);
-    const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8);
-    const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4);
-    const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4);
-
-    // Thread IDs
-    const uint g_tidY = get_global_id(DIM_Y); // 0,...,get_global_size(DIM_Y)-1
-    const uint g_tidX = get_global_id(DIM_X); // 0,...,get_global_size(DIM_X)-1
-    const uint l_tidX = get_local_id(DIM_X); // 0,...,31 in WG
-    const uint l_tidY = get_local_id(DIM_Y); // 0,1,2,3 in WG
-    const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX; // 0,1,2,...,127
-
-    // SubGroup IDs
-    const uint sg_tid = get_sub_group_local_id(); // 0,...,7 (SG_SIZE - 1)
-    const uint sg_global_idX = (uint)(g_tidX / SG_SIZE); // g_tidX / 8
-    const uint sg_global_idY = g_tidY;
-
-    const uint sg_local_idX = (uint)(l_tidX / SG_SIZE); // l_tidX / 8 = 0,...,3
-    const uint sg_local_idY = l_tidY; // 0,1,2,3
-    const uint sg_local_id = sg_local_idY * get_local_size(DIM_X) / SG_SIZE + sg_local_idX; // get_local_size(DIM_X) / SG_SIZE = 32/8 = 4
-
-    const uint sub_group_id = get_sub_group_id();
-
-    // Registers
-    int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item is responsible for a 32x4 chunk of int elements // (32/8)*4
-    int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
-    int8 colB[2]; // each lane will store 32x4 piece of matrixB
-
-    // SLM indices
-    const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY;
-    const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8);
-    const uint numElements32x8TileB = numElements32x32TileB / 4;
-    const uint l_offsetTileB = numElements32x32TileB * sg_local_idX;
-    const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid;
-    const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid;
-    const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid;
-    const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid;
-
-    // Global indices
-    uint g_idxA[2];
-    uint g_idxB[2];
-#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB)
-    g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid;
-    g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid;
-    g_idxA[1] = g_idxA[0] + l_groupSize;
-    g_idxB[1] = g_idxB[0] + l_groupSize;
-#else // Row (matrixA) and Col (matrixB) major layout
-    g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) +
-                (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
-    g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) +
-                (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2);
-    g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K /
sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - /* - * SLM setup - HDC read only - */ - // Overlap HDC reads with mmad compute - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - /* - * mmad compute - */ - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - l_offsetTileB_col3, rowA, colB, regC); - - /* - * SLM setup - SLM write only - */ - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - /* - * Last mmad compute iteration (avoids branching in main loop) - */ - - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); 
- float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - - // eltwise calibs - float4 eltw_calib_f = vload4(0, eltw_calibrations + feature); - - uchar16 eltw[(2*SG_TILE_M) / (sizeof(int8) / sizeof(int))]; - uint tmpcOff = cOffset; - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) - for (uint i = 0; i < (2*SG_TILE_M) / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(tmpcOff); -#if IN_OUT_OPT == 1 - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(g_outC_uchar + padded_offset))); -#else - const uint eltw_second_input_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset_account_padding)(tmpcOff, ELTW_STRIDE_X, ELTW_STRIDE_Y); - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(input2 + eltw_second_input_offset))); -#endif - tmpcOff += sizeof(uchar16) * SG_SIZE; - } - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2]; - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2 + 1]; - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } -#else - // Write final accumulated values - uint cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif -} - -#undef SUM_SCALE -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl deleted file mode 100644 index 30542a197ca..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_mmad_32x32sg_224x128wg_slm_int8.cl +++ /dev/null @@ -1,505 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
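[Editor's note on the quantization path shared by this kernel and the 224x128 variant below: stripped of the 16-lane macro unrolling, the default QUANTIZATION branch reduces to the following scalar sketch. It is an illustrative model only, written in plain C with a hypothetical helper name; the ACTIVATION callbacks applied between the two stages are omitted, and the non-saturating stage-1 conversion mirrors the original's convert_char.]

    #include <math.h>

    /* Scalar model of the default QUANTIZATION path: stage 1 scales the int32
     * convolution accumulator by per-channel quant/bias/calibration factors
     * (convert_char, no saturation, in the original); stage 2 adds the int8
     * eltwise operand and requantizes with saturation (convert_char_sat). */
    static unsigned char requantize_one(int conv_acc, float quant, float i_qf,
                                        float bias, float calib,
                                        signed char eltw_in, float eltw_calib)
    {
        signed char conv_q = (signed char)roundf(
            ((float)conv_acc * quant * i_qf + bias) * calib);

        int sum = (int)conv_q + (int)eltw_in;      /* integer eltwise add */
        int out = (int)roundf((float)sum * eltw_calib);
        if (out < -128) out = -128;                /* saturate to int8 range */
        if (out > 127)  out = 127;
        return (unsigned char)out;  /* reinterpreted as uchar for the block write */
    }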
- -#include "include/mmad.cl" - -#define SUM_SCALE 0.11f -#define SCALE 0.11f - -#ifdef LIGHTWEIGHT_QUANTIZATION - -#define QUANTIZATION(idx) \ - {\ - float4 tmp;\ - for(uint z = 0; z < 4; z++)\ - {\ - tmp.s0 = (float)eltw_input_vals[z * 4 + 0] * SUM_SCALE + bias_f.s0;\ - tmp.s1 = (float)eltw_input_vals[z * 4 + 1] * SUM_SCALE + bias_f.s1;\ - tmp.s2 = (float)eltw_input_vals[z * 4 + 2] * SUM_SCALE + bias_f.s2;\ - tmp.s3 = (float)eltw_input_vals[z * 4 + 3] * SUM_SCALE + bias_f.s3;\ - \ - regC_uchar16[z * 4 + 0] = convert_uchar_sat( (regC[0 * 4 + i][idx + z / 4]) * SCALE + tmp.s0);\ - regC_uchar16[z * 4 + 1] = convert_uchar_sat( (regC[1 * 4 + i][idx + z / 4]) * SCALE + tmp.s1);\ - regC_uchar16[z * 4 + 2] = convert_uchar_sat( (regC[2 * 4 + i][idx + z / 4]) * SCALE + tmp.s2);\ - regC_uchar16[z * 4 + 3] = convert_uchar_sat( (regC[3 * 4 + i][idx + z / 4]) * SCALE + tmp.s3);\ - }\ - } - -#elif NO_QUANTIZATION - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = regC[0 * 4 + i][idx];\ - regC_uchar16.s1 = regC[1 * 4 + i][idx];\ - regC_uchar16.s2 = regC[2 * 4 + i][idx];\ - regC_uchar16.s3 = regC[3 * 4 + i][idx];\ - \ - regC_uchar16.s4 = regC[0 * 4 + i][idx+1];\ - regC_uchar16.s5 = regC[1 * 4 + i][idx+1];\ - regC_uchar16.s6 = regC[2 * 4 + i][idx+1];\ - regC_uchar16.s7 = regC[3 * 4 + i][idx+1];\ - \ - regC_uchar16.s8 = regC[0 * 4 + i][idx+2];\ - regC_uchar16.s9 = regC[1 * 4 + i][idx+2];\ - regC_uchar16.sa = regC[2 * 4 + i][idx+2];\ - regC_uchar16.sb = regC[3 * 4 + i][idx+2];\ - \ - regC_uchar16.sc = regC[0 * 4 + i][idx+3];\ - regC_uchar16.sd = regC[1 * 4 + i][idx+3];\ - regC_uchar16.se = regC[2 * 4 + i][idx+3];\ - regC_uchar16.sf = regC[3 * 4 + i][idx+3];\ - {\ - int16 sum;\ - for(uint s = 0; s <16; s++)\ - {\ - sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ - }\ - regC_uchar16.s0 = convert_uchar_sat( sum.s0 );\ - regC_uchar16.s1 = convert_uchar_sat( sum.s1 );\ - regC_uchar16.s2 = convert_uchar_sat( sum.s2 );\ - regC_uchar16.s3 = convert_uchar_sat( sum.s3 );\ - \ - regC_uchar16.s4 = convert_uchar_sat( sum.s4 );\ - regC_uchar16.s5 = convert_uchar_sat( sum.s5 );\ - regC_uchar16.s6 = convert_uchar_sat( sum.s6 );\ - regC_uchar16.s7 = convert_uchar_sat( sum.s7 );\ - \ - regC_uchar16.s8 = convert_uchar_sat( sum.s8 );\ - regC_uchar16.s9 = convert_uchar_sat( sum.s9 );\ - regC_uchar16.sa = convert_uchar_sat( sum.sa );\ - regC_uchar16.sb = convert_uchar_sat( sum.sb );\ - \ - regC_uchar16.sc = convert_uchar_sat( sum.sc );\ - regC_uchar16.sd = convert_uchar_sat( sum.sd );\ - regC_uchar16.se = convert_uchar_sat( sum.se );\ - regC_uchar16.sf = convert_uchar_sat( sum.sf );\ - } - -#else - -#define QUANTIZATION(idx) \ - regC_uchar16.s0 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s1 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s2 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s3 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+1]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s5 = as_uchar(ACTIVATION( convert_char(round(( 
(float)(regC[1 * 4 + i][idx+1]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.s6 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+1]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.s7 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+1]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+2]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.s9 = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+2]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.sa = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+2]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sb = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+2]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - \ - regC_uchar16.sc = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[0 * 4 + i][idx+3]) * quant_f.s0 * I_QF + bias_f.s0) * calib_f.s0)), ACTIVATION_PARAMS));\ - regC_uchar16.sd = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[1 * 4 + i][idx+3]) * quant_f.s1 * I_QF + bias_f.s1) * calib_f.s1)), ACTIVATION_PARAMS));\ - regC_uchar16.se = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[2 * 4 + i][idx+3]) * quant_f.s2 * I_QF + bias_f.s2) * calib_f.s2)), ACTIVATION_PARAMS));\ - regC_uchar16.sf = as_uchar(ACTIVATION( convert_char(round(( (float)(regC[3 * 4 + i][idx+3]) * quant_f.s3 * I_QF + bias_f.s3) * calib_f.s3)), ACTIVATION_PARAMS));\ - {\ - int16 sum;\ - for(uint s = 0; s <16; s++)\ - {\ - sum[s] = (int)as_char(regC_uchar16[s]) + (int)as_char(eltw_input_vals[s]);\ - }\ - regC_uchar16.s0 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s0) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s1 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s1) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s2 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s2) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s3 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s3) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.s4 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s4) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s5 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s5) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s6 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s6) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s7 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s7) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.s8 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s8) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.s9 = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.s9) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sa = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sa) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sb = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sb) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - \ - regC_uchar16.sc = 
as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sc) * eltw_calib_f.s0)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sd = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sd) * eltw_calib_f.s1)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.se = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.se) * eltw_calib_f.s2)), ACTIVATION_PARAMS_ELTW));\ - regC_uchar16.sf = as_uchar(ACTIVATION_ELTW( convert_char_sat(round( (float)(sum.sf) * eltw_calib_f.s3)), ACTIVATION_PARAMS_ELTW));\ - } -#endif - -inline uint FUNC(calculate_output_offset_to_account_padding)(uint cOffset) -{ -#if OUT_WITH_PADDING == 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * OUT_F_BLOCK_PITCH; - padded_offset += b_slice_idx * OUT_B_BLOCK_PITCH; - padded_offset += y_idx * OUT_Y_PITCH; - padded_offset += x_idx * OUT_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += OUT_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} - -#if IN_OUT_OPT != 1 -inline uint FUNC(calculate_eltw_input_offset_based_on_output_offset_account_padding)(uint cOffset, uint strideX, uint strideY) -{ -#if ELTW_WITH_PADDING == 1 || ELTW_STRIDE_X != 1 || ELTW_STRIDE_Y != 1 - uint tmp_idx = cOffset; - uint f_val_idx = tmp_idx % 32; - tmp_idx /= 32; - uint b_val_idx = tmp_idx % 4; - tmp_idx /= 4; - uint x_idx = tmp_idx % OUTPUT_SIZE_X; - x_idx *= strideX; - tmp_idx /= OUTPUT_SIZE_X; - uint y_idx = tmp_idx % OUTPUT_SIZE_Y; - y_idx *= strideY; - tmp_idx /= OUTPUT_SIZE_Y; - uint b_slice_idx = tmp_idx % (OUTPUT_BATCH_NUM / 4); - tmp_idx /= (OUTPUT_BATCH_NUM / 4); - uint f_slice_idx = tmp_idx % (OUTPUT_FEATURE_NUM / 32); - - uint padded_offset = f_slice_idx * IN2_F_BLOCK_PITCH; - padded_offset += b_slice_idx * IN2_B_BLOCK_PITCH; - padded_offset += y_idx * IN2_Y_PITCH; - padded_offset += x_idx * IN2_X_PITCH; - padded_offset += b_val_idx * 32; - padded_offset += f_val_idx; - padded_offset += IN2_OFFSET; - - return padded_offset; -#else - return cOffset; -#endif -} -#endif - -inline void FUNC(mmad_32x32_int8)( __local uint* l_tileA, const uint l_offsetTileA, - __local int8* l_tileB, const uint l_offsetTileB_col0, - const uint l_offsetTileB_col1, const uint l_offsetTileB_col2, - const uint l_offsetTileB_col3, int8* rowA, int8* colB, - int8* regC) -{ - // Read tile A from SLM to regA - uint l_offsetTileATemp = l_offsetTileA; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - rowA[j] = as_int8(SLM_BLOCK_READ_8(&l_tileA[l_offsetTileATemp])); - l_offsetTileATemp += 8 * SG_SIZE; - } - // Read tile B from SLM to regB and compute mmad - colB[0] = l_tileB[l_offsetTileB_col0]; - colB[1] = l_tileB[l_offsetTileB_col1]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[0*(SIMD_LANE_M / 8) + j] = MMAD_8x8( rowA[j], colB[0], regC[0*(SIMD_LANE_M / 8) + j]); - } - colB[0] = l_tileB[l_offsetTileB_col2]; - __attribute__((opencl_unroll_hint(SG_TILE_M / 8))) - for (uint j = 0; j < (SG_TILE_M / 8); ++j) - { - // Compute partial C - regC[1*(SIMD_LANE_M / 8) + j] = 
MMAD_8x8( rowA[j], colB[1], regC[1*(SIMD_LANE_M / 8) + j] );
-    }
-    colB[1] = l_tileB[l_offsetTileB_col3];
-    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
-    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
-    {
-        // Compute partial C
-        regC[2*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[0], regC[2*(SIMD_LANE_M / 8) + j]);
-    }
-    __attribute__((opencl_unroll_hint(SG_TILE_M / 8)))
-    for (uint j = 0; j < (SG_TILE_M / 8); ++j)
-    {
-        // Compute partial C
-        regC[3*(SIMD_LANE_M / 8) + j] = MMAD_8x8(rowA[j], colB[1], regC[3*(SIMD_LANE_M / 8) + j]);
-    }
-}
-
-/*
- * \brief GEMM kernel to compute MxN matrix using SLM
- * \param g_inA - Input matrix
- * \param g_inB - Input matrix
- * \param g_outC - Output matrix
- */
-
-__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
-KERNEL(Kernel_GEMM_MMAD8_32x32SG_224x128WG_SLM_INT8_fused_eltwise)
-    (__global char* const g_inA,
-    __global int* g_outC,
-    __global char* const g_inB,
-    #if BIAS_TERM
-    __global BIAS_TYPE* biases,
-    #endif
-    __global float* quantizations,
-    #if CALIBRATION_TERM
-    __global float* calibrations,
-    #endif
-    uint split_idx,
-    __global char* const input2,
-    __global float* eltw_calibrations
-    )
-{
-
-    __global int4* const g_matrixA = (__global int4*)g_inA;
-    __global int4* const g_matrixB = (__global int4*)g_inB;
-    __global int8* g_matrixC = (__global int8*)g_outC;
-
-    // Each work-group works to compute a 224x128 (WG_TILE_M x WG_TILE_N) tile.
-    // Each work-group contains 28 sub-groups.
-    // Each sub-group within the work-group works to compute a 32x32 tile.
-    // 1) All work-items in WG fill SLM with tileA (224x32) and tileB (32x128).
-    // 2) Each sub-group works to compute 32x32 tileC (stored in regC).
-    // Note that each work-item in the sub-group computes a 32x4 chunk of tileC.
-    // 3) Repeat until tileC is fully computed (while moving tileA and tileB "windows")
-    __local int8 l_workGroupTileA[2 * (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int8)];
-    __local int8 l_workGroupTileB[2 * (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8)];
-
-    __local uint* l_workGroupTileA_uint = (__local uint*)l_workGroupTileA;
-    __local int4* l_workGroupTileA_int4 = (__local int4*)l_workGroupTileA;
-    __local int4* l_workGroupTileB_int4 = (__local int4*)l_workGroupTileB;
-
-    const uint l_groupSize = (uint)get_local_size(DIM_X) * (uint)get_local_size(DIM_Y);
-
-    const uint l_pingPongOffsetA_uint = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(uint);
-    const uint l_pingPongOffsetB_int8 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int8);
-    const uint l_pingPongOffsetA_int4 = (WG_TILE_M * MATRIX_SMALL_K) / sizeof(int4);
-    const uint l_pingPongOffsetB_int4 = (WG_TILE_N * MATRIX_SMALL_K) / sizeof(int4);
-
-    // Thread IDs
-    const uint g_tidY = get_global_id(DIM_Y);
-    const uint g_tidX = get_global_id(DIM_X);
-    const uint l_tidX = get_local_id(DIM_X);
-    const uint l_tidY = get_local_id(DIM_Y);
-    const uint l_tid = l_tidY * (uint)get_local_size(DIM_X) + l_tidX;
-
-    // SubGroup IDs
-    const uint sg_tid = get_sub_group_local_id();
-    const uint sg_global_idX = (uint)(g_tidX / SG_SIZE);
-    const uint sg_global_idY = g_tidY;
-    const uint sg_local_idX = (uint)(l_tidX / SG_SIZE);
-    const uint sg_local_idY = l_tidY;
-    const uint sg_local_id = sg_local_idY * (uint)get_local_size(DIM_X) / SG_SIZE + sg_local_idX;
-
-    const uint sub_group_id = get_sub_group_id();
-
-    // Registers
-    int8 regC[(SIMD_LANE_M / 8) * SIMD_LANE_N] = {0}; // Each work-item is responsible for a 32x4 chunk of int elements
-    int8 rowA[(SG_TILE_M * MATRIX_SMALL_K / SG_SIZE) / sizeof(int8)]; // each work-item will hold 1/8 of matrixA
-    int8 colB[2]; // each lane will
store 32x4 piece of matrixB - - // SLM indices - const uint l_offsetTileA = SG_TILE_M * (MATRIX_SMALL_K / sizeof(uint)) * sg_local_idY; - const uint numElements32x32TileB = (MATRIX_SMALL_K * SG_TILE_N) / sizeof(int8); - const uint numElements32x8TileB = numElements32x32TileB / 4; - const uint l_offsetTileB = numElements32x32TileB * sg_local_idX; - const uint l_offsetTileB_col0 = l_offsetTileB + sg_tid; - const uint l_offsetTileB_col1 = l_offsetTileB + 1 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col2 = l_offsetTileB + 2 * numElements32x8TileB + sg_tid; - const uint l_offsetTileB_col3 = l_offsetTileB + 3 * numElements32x8TileB + sg_tid; - - // Global indices - uint g_idxA[2]; - uint g_idxB[2]; -#ifdef TILED_GLOBAL_LAYOUT // 32-row major (matrixA) and 32-col major (matrixB) - g_idxA[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_M) * (uint)get_group_id(DIM_Y) + l_tid; - g_idxB[0] = ((MATRIX_SMALL_K / sizeof(int4)) * WG_TILE_N) * (uint)get_group_id(DIM_X) + l_tid; - g_idxA[1] = g_idxA[0] + l_groupSize; - g_idxB[1] = g_idxB[0] + l_groupSize; -#else // Row (matrixA) and Col (matrixB) major layout - g_idxA[0] = WG_TILE_M * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_Y) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxB[0] = WG_TILE_N * (MATRIX_K / sizeof(int4)) * (uint)get_group_id(DIM_X) + - (l_tid / 2) * (MATRIX_K / sizeof(int4)) + (l_tid % 2); - g_idxA[1] = g_idxA[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); - g_idxB[1] = g_idxB[0] + (l_groupSize / 2) * (MATRIX_K / sizeof(int4)); -#endif - // Initial SLM setup - { - l_workGroupTileA_int4[l_tid] = g_matrixA[g_idxA[0]]; - l_workGroupTileB_int4[l_tid] = g_matrixB[g_idxB[0]]; - - l_workGroupTileA_int4[l_tid + l_groupSize] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - l_workGroupTileB_int4[l_tid + l_groupSize] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - barrier(CLK_LOCAL_MEM_FENCE); - } - int4 hdcReadValueA[2]; - int4 hdcReadValueB[2]; - - __attribute__((opencl_unroll_hint(1))) - for (uint k = 0; k < (MATRIX_K / MATRIX_SMALL_K) - 1; k++) - { - hdcReadValueA[0] = g_matrixA[g_idxA[0]]; - hdcReadValueB[0] = g_matrixB[g_idxB[0]]; - hdcReadValueA[1] = g_matrixA[g_idxA[1]]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - hdcReadValueB[1] = g_matrixB[g_idxB[1]]; - } -#ifdef TILED_GLOBAL_LAYOUT - g_idxA[0] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_M * MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_N * MATRIX_SMALL_K / sizeof(int4); -#else - g_idxA[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[0] += MATRIX_SMALL_K / sizeof(int4); - g_idxA[1] += MATRIX_SMALL_K / sizeof(int4); - g_idxB[1] += MATRIX_SMALL_K / sizeof(int4); -#endif - - - //MMAD compute - FUNC_CALL(mmad_32x32_int8)(&l_workGroupTileA_uint[(k % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, &l_workGroupTileB[(k % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, - 
l_offsetTileB_col3, rowA, colB, regC); - - //SLM setup - SLM write only - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid] = hdcReadValueA[0]; - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid] = hdcReadValueB[0]; - l_workGroupTileA_int4[((k + 1) % 2 * l_pingPongOffsetA_int4) + l_tid + l_groupSize] = hdcReadValueA[1]; - if (l_tid < 32) - { - // Not all work-items will be needed to fetch the remaining matrix B - l_workGroupTileB_int4[((k + 1) % 2 * l_pingPongOffsetB_int4) + l_tid + l_groupSize] = hdcReadValueB[1]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } // main outer loop - - //Last MMAD compute iteration (avoids branching in main loop) - FUNC_CALL(mmad_32x32_int8)( - &l_workGroupTileA_uint[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetA_uint], - l_offsetTileA, - &l_workGroupTileB[(((MATRIX_K / MATRIX_SMALL_K) - 1) % 2) * l_pingPongOffsetB_int8], - l_offsetTileB_col0, l_offsetTileB_col1, l_offsetTileB_col2, l_offsetTileB_col3, rowA, colB, - regC); - - -#ifdef OUTPUT_TILED_GLOBAL_LAYOUT - - // Write out in swizzled manner after quantizing - __global uchar* g_outC_uchar = (__global uchar*)g_outC; - uint cOffset = sg_global_idX * (MATRIX_M * SG_TILE_N / sizeof(uchar)) + - sg_global_idY * (SG_TILE_M * SG_TILE_N / sizeof(uchar)); - - uchar16 regC_uchar16; - uint offset_uc16 = 0; - - const uint workgroup_id_x = get_group_id(0); - uint feature_off = 32*(sub_group_id % (WG_TILE_N / 32)) + WG_TILE_N*workgroup_id_x; //=32*{0,1,2,3} + WG_TILE_N * workgroup_id_x - uint feature = get_sub_group_local_id()*4 + feature_off; - - float4 quant_f = vload4(0, quantizations + feature); - float4 bias_f = vload4(0, biases + feature); - float4 calib_f = vload4(0, calibrations + feature); - - // eltwise calibs - float4 eltw_calib_f = vload4(0, eltw_calibrations + feature); - - uchar16 eltw[(2*SG_TILE_M) / (sizeof(int8) / sizeof(int))]; - uint tmpcOff = cOffset; - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) - for (uint i = 0; i < (2*SG_TILE_M) / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(tmpcOff); -#if IN_OUT_OPT == 1 - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(g_outC_uchar + padded_offset))); -#else - const uint eltw_second_input_offset = FUNC_CALL(calculate_eltw_input_offset_based_on_output_offset_account_padding)(tmpcOff, ELTW_STRIDE_X, ELTW_STRIDE_Y); - eltw[i] = as_uchar16(intel_sub_group_block_read4((__global uint*)(input2 + eltw_second_input_offset))); -#endif - tmpcOff += sizeof(uchar16) * SG_SIZE; - } - -#if MMAD_SUPPORTED == 1 - __attribute__((opencl_unroll_hint( SG_TILE_M / (sizeof(int8) / sizeof(int)) ))) -#endif - for (uint i = 0; i < SG_TILE_M / (sizeof(int8) / sizeof(int)); i++) - { - uint padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2]; - // B0..3, F0..31 - QUANTIZATION(0); - } - - intel_sub_group_block_write4((__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16)); - cOffset += sizeof(uchar16) * SG_SIZE; - - // now we need to calculate again for other x - padded_offset = FUNC_CALL(calculate_output_offset_to_account_padding)(cOffset); - { - uchar16 eltw_input_vals = eltw[i * 2 + 1]; - // B0..3, F0..31 - QUANTIZATION(4); - } - - intel_sub_group_block_write4( (__global uint*)(g_outC_uchar + padded_offset), as_uint4(regC_uchar16) ); - cOffset += sizeof(uchar16) * SG_SIZE; - } -#else - // Write final accumulated values - uint 
cOffset = sg_global_idX * ((MATRIX_M / 8) * SG_TILE_N) + sg_global_idY * (SG_TILE_M / 8) + - sg_tid * (MATRIX_M / 8); - __attribute__((opencl_unroll_hint(SIMD_LANE_N))) - for (uint i = 0; i < (SIMD_LANE_N); ++i) - { - __attribute__((opencl_unroll_hint(SIMD_LANE_M / 8))) - for (uint j = 0; j < (SIMD_LANE_M / 8); ++j) - { - g_matrixC[cOffset + j] = regC[i*(SIMD_LANE_M / 8) + j]; - } - cOffset += SG_SIZE * (MATRIX_M / 8); - } -#endif -} - -#undef SUM_SCALE -#undef SCALE -#undef QUANTIZATION diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl index 6bef9de4977..0fd1fb9eecc 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f16.cl @@ -55,12 +55,6 @@ KERNEL(gen9_common_conv_fwd_f16_kernel)( #if WITH_BIAS const __global half *bias, #endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif #if HAS_FUSED_OPS_DECLS FUSED_OPS_DECLS, #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl index 024c3b818d5..b3f717d0461 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/gen9_common_conv_fwd_data_f32.cl @@ -53,12 +53,6 @@ KERNEL(gen9_common_conv_fwd_f32_kernel)( #if WITH_BIAS const __global float *bias, #endif -#if QUANTIZATION_TERM - __global float* quantizations, -#endif -#if CALIBRATION_TERM - __global float* calibrations, -#endif #if HAS_FUSED_OPS_DECLS FUSED_OPS_DECLS, #endif diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl index 9b1fc3d3805..1fe945327b3 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/generic_eltwise_ref.cl @@ -26,11 +26,7 @@ KERNEL(eltwise)( INPUTS_DECLS - __global OUTPUT_TYPE* output -#if CALIBRATION_TERM - , const __global float* calibrations -#endif - ) + __global OUTPUT_TYPE* output) { #if OUTPUT_DIMS == 6 // 4D spatial diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl deleted file mode 100644 index 33d340337fc..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/index_select_gpu_ref.cl +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "include/include_all.cl" - - -KERNEL(index_select_gpu_ref)( - const __global UNIT_TYPE* input, -#ifndef REVERSE - const __global int* indices, -#endif - __global UNIT_TYPE* output) -{ - // [CONSTEXPR]: - const uint input_sx = INPUT0_SIZE_X; - const uint input_sy = INPUT0_SIZE_Y; - const uint input_sf = INPUT0_FEATURE_NUM; - const uint input_sb = INPUT0_BATCH_NUM; - - const uint out_b = (uint) get_global_id(0); - const uint indices_idx = (uint) get_global_id(1); - const uint feature_idx = (uint) get_global_id(2); - - #if AXES_NUMBER == 1 - #ifdef REVERSE - const uint indices_value = REVERSE_AXIS_SIZE - 1 - indices_idx; - #else - const uint indices_value = indices[indices_idx]; - #endif - #elif AXES_NUMBER > 1 - #ifdef REVERSE - uint indices_value[4] = { - #ifdef REVERSE_INDEX_SELECT_AXIS_BATCH_SIZE - REVERSE_INDEX_SELECT_AXIS_BATCH_SIZE - 1 - out_b, - #else - out_b, - #endif - #ifdef REVERSE_INDEX_SELECT_AXIS_FEATURE_SIZE - REVERSE_INDEX_SELECT_AXIS_FEATURE_SIZE - 1 - feature_idx, - #else - feature_idx, - #endif - #ifdef REVERSE_INDEX_SELECT_AXIS_Y_SIZE - REVERSE_INDEX_SELECT_AXIS_Y_SIZE - 1 - indices_idx, - #else - indices_idx, - #endif - 0 - }; - #endif - #endif - - // [LOGIC]: - #if AXES_NUMBER > 1 - for(uint x = 0; x < input_sx; x++) - { - #ifdef REVERSE_INDEX_SELECT_AXIS_X_SIZE - indices_value[3] = REVERSE_INDEX_SELECT_AXIS_X_SIZE - 1 - x; - #else - indices_value[3] = x; - #endif - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, indices_idx, x)] = input[GET_DATA_INDEX(INPUT0, indices_value[0], indices_value[1], indices_value[2], indices_value[3])]; - } - - #else - #ifdef INDEX_SELECT_AXIS_BATCH - for(uint x = 0; x < input_sx; x++) - { - for(uint y = 0; y < input_sy; y++) - { - output[GET_DATA_INDEX(OUTPUT, indices_idx, feature_idx, y, x)] = input[GET_DATA_INDEX(INPUT0, indices_value, feature_idx, y, x)]; - } - } - #elif defined INDEX_SELECT_AXIS_FEATURE - for(uint x = 0; x < input_sx; x++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, indices_idx, feature_idx, x)] = input[GET_DATA_INDEX(INPUT0, out_b, indices_value, feature_idx, x)]; - } - #elif defined INDEX_SELECT_AXIS_X - for(uint i = 0; i < input_sy; i++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, i, indices_idx)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, i, indices_value)]; - } - #elif defined INDEX_SELECT_AXIS_Y - - for(uint i = 0; i < input_sx; i++) - { - output[GET_DATA_INDEX(OUTPUT, out_b, feature_idx, indices_idx, i)] = input[GET_DATA_INDEX(INPUT0, out_b, feature_idx, indices_value, i)]; - } - #endif - #endif -} \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl deleted file mode 100644 index 99549fc9cdc..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_axis.cl +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifdef BATCH_AXIS - #define GAP_SIZE (INPUT0_FEATURE_NUM * INPUT0_SIZE_X * INPUT0_SIZE_Y) - #define VALUES_NUM INPUT0_BATCH_NUM - #define FIRST_DIM_SIZE INPUT0_SIZE_X - #define SECOND_DIM_SIZE INPUT0_SIZE_Y - #define FIRST_DIM_MUL 1 - #define SECOND_DIM_MUL INPUT0_SIZE_X - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y) -#endif -#ifdef FEATURE_AXIS - #define GAP_SIZE (INPUT0_SIZE_X * INPUT0_SIZE_Y) - #define VALUES_NUM INPUT0_FEATURE_NUM - #define FIRST_DIM_SIZE INPUT0_SIZE_X - #define SECOND_DIM_SIZE INPUT0_SIZE_Y - #define FIRST_DIM_MUL 1 - #define SECOND_DIM_MUL INPUT0_SIZE_X - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM) -#endif -#ifdef Y_AXIS - #define GAP_SIZE INPUT0_SIZE_X - #define VALUES_NUM INPUT0_SIZE_Y - #define FIRST_DIM_SIZE INPUT0_SIZE_X - #define SECOND_DIM_SIZE INPUT0_FEATURE_NUM - #define FIRST_DIM_MUL 1 - #define SECOND_DIM_MUL (INPUT0_SIZE_Y * INPUT0_SIZE_X) - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM) -#endif -#ifdef X_AXIS - #define GAP_SIZE 1 - #define VALUES_NUM INPUT0_SIZE_X - #define FIRST_DIM_SIZE INPUT0_SIZE_Y - #define SECOND_DIM_SIZE INPUT0_FEATURE_NUM - #define FIRST_DIM_MUL INPUT0_SIZE_X - #define SECOND_DIM_MUL (INPUT0_SIZE_Y * INPUT0_SIZE_X) - #define THIRD_DIM_MUL (INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM) -#endif - - -#include "include/common.cl" -#include "include/data_types.cl" - -KERNEL(lookup_table_axis)(const __global UNIT_TYPE* input0, const __global float* indices, __global UNIT_TYPE* output) -{ - const uint first_dim_id = (uint)get_global_id(0); - const uint second_dim_id = (uint)get_global_id(1); - const uint third_dim_id = (uint)get_global_id(2); - const uint offset = first_dim_id * FIRST_DIM_MUL + second_dim_id * SECOND_DIM_MUL + third_dim_id * THIRD_DIM_MUL; - const uint val_index = (first_dim_id + second_dim_id * FIRST_DIM_SIZE + third_dim_id * FIRST_DIM_SIZE * SECOND_DIM_SIZE) * VAL_NUM; - for (uint i = 0; i < VAL_NUM; i++) - { - uint global_index = offset + (int)indices[val_index + i] * GAP_SIZE; - output[val_index + i] = input0[global_index]; - } -} - - -#undef GAP_SIZE -#undef VALUES_NUM -#undef FIRST_DIM_SIZE -#undef SECOND_DIM_SIZE -#undef FIRST_DIM_MUL -#undef SECOND_DIM_MUL -#undef THIRD_DIM_MUL \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl deleted file mode 100644 index a8e25fd2004..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/lookup_table_ref.cl +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - - -#include "include/common.cl" -#include "include/data_types.cl" - -KERNEL(lookup_table)(const __global UNIT_TYPE* input0, const __global float* indices, __global UNIT_TYPE* output) -{ - const uint x = (uint)get_global_id(0); - const uint b = (uint)get_global_id(1); - const uint size = INPUT0_SIZE_X * INPUT0_SIZE_Y * INPUT0_FEATURE_NUM; - #ifdef INPUT0_LAYOUT_BFYX - const uint global_index = b * VAL_NUM + x; - output[global_index] = input0[(int)indices[global_index] + b*size]; - #elif defined INPUT0_LAYOUT_YXFB - const uint global_index = b + x * INPUT0_BATCH_NUM; - output[global_index] = input0[(int)indices[global_index]*INPUT0_BATCH_NUM + b]; - #endif -} - \ No newline at end of file diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl deleted file mode 100644 index 4439732718c..00000000000 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2020 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
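[Editor's note on the pooling kernel below: it processes 4 batches x 4 features per work-item in the fs_bs_yx_bsv4_fsv32 layout, and the only subtle part is the averaging divider, which depends on DYNAMIC_KERNEL_DIVIDER. A scalar, single-feature sketch of that logic follows; all names are illustrative, not code from the deleted source.]

    /* Scalar sketch of the average-pooling divider selection.
     * dynamic_divider mirrors DYNAMIC_KERNEL_DIVIDER: divide by the number of
     * window elements that actually fall inside the input, not the window size. */
    float avg_pool_at(const float* in, int width, int height,
                      int off_x, int off_y, int pool_w, int pool_h,
                      int dynamic_divider)
    {
        float sum = 0.0f;
        int count = 0;
        for (int j = 0; j < pool_h; ++j) {
            for (int i = 0; i < pool_w; ++i) {
                int x = off_x + i;
                int y = off_y + j;
                if (x >= 0 && x < width && y >= 0 && y < height) {
                    sum += in[y * width + x];  /* padding positions contribute nothing */
                    ++count;
                }
            }
        }
        int divider = dynamic_divider ? (count > 0 ? count : 1)
                                      : pool_w * pool_h;
        return sum / (float)divider;
    }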
-
-
-#include "include/include_all.cl"
-
-#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
-#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
-
-#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
-
-#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE,4)
-#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
-
-#if MAX_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_MIN
-#elif AVG_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_ZERO
-#else
-    #error
-#endif
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
-    return ACCUMULATOR_MAX_FUNC(tmp, in);
-#elif AVG_POOLING
-    return tmp + in;
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(8)))
-KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
-    const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
-    , FUSED_OPS_DECLS
-#endif
-)
-{
-    const uint x = (uint)get_global_id(0);
-    const uint y = (uint)get_global_id(1);
-    const uint bf = (uint)get_global_id(2);
-    // we process 4 features per workitem that's why we need to divide it
-    const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
-    const uint f = ((uint)get_global_id(2) * 4) % aligned32_features;
-    const uint b = 4 * (((uint)get_global_id(2) * 4) / aligned32_features);
-    if (x >= OUTPUT_SIZE_X)
-    {
-        return;
-    }
-
-    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-    ACCUMULATOR_VEC4 result[4] = { INIT_VAL };
-
-#ifdef CHECK_BOUNDRY
-    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
-        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
-    {
-        return;
-    }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-    uint num_elementes = 0;
-#endif
-
-    const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, 0, 0);
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        int input_offset_y = offset_y + j;
-        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-        if(!zero_y)
-        {
-            for(uint i = 0; i < POOL_SIZE_X; i++)
-            {
-                int input_offset_x = offset_x + i;
-                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-                if(!zero)
-                {
-                    const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
-
-                    int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
-                    for(uint b = 0; b < 4; b++)
-                    {
-                        char4 input_data = as_char4(int_data[b]);
-                        result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0]));
-                        result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1]));
-                        result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2]));
-                        result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3]));
-                    }
-
-#ifdef DYNAMIC_KERNEL_DIVIDER
-                    num_elementes++;
-#endif
-                }
-            }
-        }
-    }
-#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
-    const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
-    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
-    const uint num_elementes = (hend - offset_y) * (wend - offset_x);
-#endif
-#else
-    uint input_idx = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, offset_y, offset_x);
-
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        for(uint i = 0; i < POOL_SIZE_X; i++)
-        {
-            int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
-            for(uint b = 0; b < 4; b++)
-            {
-                char4 input_data = as_char4(int_data[b]);
-                result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0]));
-                result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1]));
-                result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2]));
-                result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3]));
-            }
-
-            input_idx += IN_X_PITCH;
-        }
-        input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
-    }
-
-#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-    const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
-#endif
-#endif
-
-#if defined AVG_POOLING
-    #if ENABLE_ROUND
-        #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE(round(((float)result[b][i] / max(num_elementes, (uint)1))));
-                }
-            }
-        #else
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
-                }
-            }
-        #endif
-    #else
-        #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE(((float)result[b][i] / max(num_elementes, (uint)1)));
-                }
-            }
-        #else
-            for(uint b = 0; b < 4; b++)
-            {
-                for(uint i = 0; i < 4; i++)
-                {
-                    result[b][i] = TO_ACCUMULATOR_TYPE((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X));
-                }
-            }
-        #endif
-    #endif // ENABLE_ROUND
-#endif // AVG_POOLING
-
-#if OUTPUT_TYPE_SIZE == 1
-    int4 final_result;
-
-    for(uint bi = 0; bi < 4; bi++)
-    {
-        #if HAS_FUSED_OPS
-            ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(convert_char4(result[bi]));
-            FUSED_OPS;
-            final_result[bi] = as_int(FUSED_OPS_RESULT);
-        #else
-            char4 char_result = ACTIVATION(convert_char4(result[bi]), ACTIVATION_PARAMS);
-            final_result[bi] = as_int(char_result);
-        #endif
-    }
-    const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
-    intel_sub_group_block_write4((__global uint*)(output + output_pos), as_uint4(final_result));
-
-#elif OUTPUT_TYPE_SIZE == 2 || OUTPUT_TYPE_SIZE == 4
-    OUTPUT_VEC4 final_result;
-
-    for(uint bi = 0; bi < 4; bi++)
-    {
-        #if HAS_FUSED_OPS
-            ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result[bi]));
-            FUSED_OPS;
-            final_result = FUSED_OPS_RESULT;
-        #else
-            char4 char_result = ACTIVATION(TO_OUTPUT_VEC4(result[bi]), ACTIVATION_PARAMS);
-            final_result = TO_OUTPUT_VEC4(char_result);
-        #endif
-        const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b + bi, f, y, x);
-        vstore4(final_result, 0, output + output_pos);
-    }
-#endif
-}
-
-#undef INIT_VAL
-#undef ACCUMULATOR_VEC4
-#undef ACCUMULATOR_VEC4
-
-#undef ACTIVATION_VEC4
-#undef TO_ACTIVATION_VEC4
-
-#undef OUTPUT_VEC4
-#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl
deleted file mode 100644
index f439e9e6e30..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2020 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
-#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
-
-#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
-
-#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
-#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
-
-#if MAX_POOLING
-    #define INIT_VAL ACCUMULATOR_VAL_MIN
-#else
-    #error
-#endif
-
-inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
-{
-#if MAX_POOLING
-    return ACCUMULATOR_MAX_FUNC(tmp, in);
-#endif
-}
-
-__attribute__((intel_reqd_sub_group_size(32)))
-KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)(
-    const __global INPUT0_TYPE* input,
-    __global OUTPUT_TYPE* output
-#if HAS_FUSED_OPS_DECLS
-    , FUSED_OPS_DECLS
-#endif
-)
-{
-    const uint x = (uint)get_group_id(0);
-    const uint y = (uint)get_group_id(1);
-    const uint bf = (uint)get_group_id(2) * BATCH_SG_COUNT + (uint)get_sub_group_id();
-    // we process 4 features per workitem that's why we need to divide it
-    const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
-    const uint f = ((bf * 32) % aligned32_features) + (get_sub_group_local_id() % 8) * 4;
-    const uint b = 4 * ((bf * 32) / aligned32_features) + (get_sub_group_local_id() / 8);
-    if (x >= OUTPUT_SIZE_X)
-    {
-        return;
-    }
-
-    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
-    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-
-    ACCUMULATOR_VEC4 result = INIT_VAL;
-
-    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
-        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
-    {
-        return;
-    }
-
-    const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b, f, 0, 0);
-    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        int input_offset_y = offset_y + j;
-
-        __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
-        for(uint i = 0; i < POOL_SIZE_X; i++)
-        {
-            int input_offset_x = offset_x + i;
-            bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-            bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-            bool zero = (zero_x || zero_y);
-            const uint input_idx = zero ? 0 : batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
-
-            const __global uint* input_uint = (const __global uint*)(input + input_idx);
-            int int_data = as_int(input_uint[0]);
-
-            char4 input_data = zero ? (char4)(INIT_VAL,INIT_VAL,INIT_VAL,INIT_VAL) : as_char4(int_data);
-            result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
-            result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
-            result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
-            result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
-        }
-    }
-
-    OUTPUT_VEC4 final_result;
-
-    #if HAS_FUSED_OPS
-        ACTIVATION_VEC4 pool_result;
-        pool_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result));
-        FUSED_OPS;
-        final_result = FUSED_OPS_RESULT;
-    #else
-        char4 pool_result;
-        for(uint op = 0; op < 4; op++)
-        {
-            pool_result[op] = ACTIVATION(TO_OUTPUT_TYPE(result[op]), ACTIVATION_PARAMS);
-        }
-        final_result = TO_OUTPUT_VEC4(pool_result);
-    #endif
-
-    const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
-    *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;
-}
-
-#undef INIT_VAL
-#undef ACCUMULATOR_VEC4
-
-#undef ACTIVATION_VEC4
-#undef TO_ACTIVATION_VEC4
-
-#undef OUTPUT_VEC4
-#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl
deleted file mode 100644
index fde8e973320..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/scale_grad_weights_gpu_ref.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-#define LOCAL_SIZE INPUT0_BATCH_NUM
-
-KERNEL(scale_grad_weights_gpu_ref)(
-    const __global UNIT_TYPE* input,
-    const __global UNIT_TYPE* input_grad,
-    __global OUTPUT_TYPE* output,
-    __global float* scale,
-#if BIAS_TERM
-    __global float* bias,
-#endif
-#if MOMENTUM
-    __global float* prev_grad_w,
-#if BIAS_TERM
-    __global float* prev_grad_b,
-#endif
-#endif
-    const float lr
-    )
-{
-    __local ACCUMULATOR_TYPE grad_sum[LOCAL_SIZE];
-    __local ACCUMULATOR_TYPE grad_sum_in[LOCAL_SIZE];
-
-    const uint local_idx = (uint)get_local_id(0);
-    const uint f = (uint)get_global_id(1);
-
-    grad_sum[local_idx] = 0;
-    grad_sum_in[local_idx] = 0;
-
-    uint grad_idx = GET_DATA_INDEX(INPUT0, local_idx, f, 0, 0);
-    for (uint y = 0; y < INPUT0_SIZE_Y; y++)
-    {
-        for (uint x = 0; x < INPUT0_SIZE_X; x++)
-        {
-            ACCUMULATOR_TYPE in_g = TO_ACCUMULATOR_TYPE(input_grad[grad_idx]);
-            grad_sum[local_idx] += in_g * lr;
-            grad_sum_in[local_idx] += in_g * TO_ACCUMULATOR_TYPE(input[grad_idx]) * lr;
-            grad_idx += INPUT0_X_PITCH;
-        }
-        grad_idx += INPUT0_Y_PITCH - INPUT0_SIZE_X * INPUT0_X_PITCH;
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
-    {
-        if (local_idx < offset)
-        {
-            grad_sum[local_idx] += grad_sum[local_idx + offset];
-            grad_sum_in[local_idx] += grad_sum_in[local_idx + offset];
-        }
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    if (local_idx == 0)
-    {
-#if MOMENTUM
-        ACCUMULATOR_TYPE update_gradient_w = grad_sum_in[0] + prev_grad_w[f] * MOMENTUM_FACTOR + DECAY_RATE * lr * scale[f];
-        scale[f] -= update_gradient_w;
-        prev_grad_w[f] = update_gradient_w;
-#else
-        scale[f] -= grad_sum_in[0] + DECAY_RATE * lr * scale[f];
-#endif
-
-#if BIAS_TERM
-#if MOMENTUM
-        ACCUMULATOR_TYPE update_gradient_b = prev_grad_b[f] * MOMENTUM_FACTOR + grad_sum[0];
-        bias[f] -= update_gradient_b;
-        prev_grad_b[f] = update_gradient_b;
-#else
-        bias[f] -= grad_sum[0];
-#endif
-#endif
-    }
-}
-
-#undef LOCAL_SIZE
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl
deleted file mode 100644
index b3f09cc27b8..00000000000
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/softmax_loss_grad_gpu_ref.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2018 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#include "include/include_all.cl"
-
-KERNEL(softmax_loss_grad_gpu_ref)(
-    const __global INPUT0_TYPE* input_pred,
-    __global OUTPUT_TYPE* output,
-    const __global INPUT1_TYPE* labels
-    )
-{
-    const uint b_x = get_global_id(0);
-    const uint batch_id = b_x / OUTPUT_SIZE_X;
-    const uint x = b_x % OUTPUT_SIZE_X;
-
-    const uint input_pred_idx = GET_DATA_INDEX(INPUT0, batch_id, 0, 0, x);
-    const uint labels_idx = GET_DATA_INDEX(INPUT1, batch_id, 0, 0, 0);
-
-    UNIT_TYPE label = labels[labels_idx];
-    const uint output_idx = GET_DATA_INDEX(OUTPUT, batch_id, 0, 0, x);
-
-    if(label == x)
-        output[output_idx] = input_pred[input_pred_idx] - 1;
-    else
-        output[output_idx] = input_pred[input_pred_idx];
-}
\ No newline at end of file
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
index fb84aa9c33d..37a9258c658 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.cpp
@@ -112,8 +112,6 @@ std::string common_kernel_base::CreateJit(const std::string& template_name,
 Arguments common_kernel_base::GetArgsDesc(uint32_t num_of_input,
                                           bool use_weights,
                                           bool use_bias,
-                                          bool use_quantization,
-                                          bool use_output_calibration,
                                           uint32_t number_of_inputs_for_fused_prim) const {
     Arguments args;
 
@@ -131,14 +129,6 @@ Arguments common_kernel_base::GetArgsDesc(uint32_t num_of_input,
         args.push_back({ArgumentDescriptor::Types::BIAS, 0});
     }
 
-    if (use_quantization && use_weights) {
-        args.push_back({ArgumentDescriptor::Types::WEIGHTS_QUANTIZATION_FACTORS, 0});
-    }
-
-    if (use_output_calibration) {
-        args.push_back({ArgumentDescriptor::Types::OUTPUT_CALIBRATION_FACTORS, 0});
-    }
-
     for (uint32_t i = 0; i < number_of_inputs_for_fused_prim; i++) {
         args.push_back({ArgumentDescriptor::Types::INPUT_OF_FUSED_PRIMITIVE, i});
     }
@@ -220,6 +210,6 @@ void common_kernel_base::FillCLKernelData(clKernelData& kernel,
     kernel.workGroups.local = {runInfo.lws0, runInfo.lws1, runInfo.lws2};
     kernel.kernelString = GetKernelString(kernelMapName, jit, entryPoint, engine_info, exeMode);
     kernel.arguments =
-        GetArgsDesc(number_of_inputs, weights, bias, false, false, number_of_inputs_for_fused_prims);
+        GetArgsDesc(number_of_inputs, weights, bias, number_of_inputs_for_fused_prims);
 }
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h
index 3bacb38eee5..3dc1c5ffd53 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/common_kernel_base.h
@@ -48,8 +48,6 @@ protected:
     Arguments GetArgsDesc(uint32_t num_of_input,
                           bool use_weights,
                           bool use_bias,
-                          bool use_quantization = false,
-                          bool use_calibration = 0,
                           uint32_t number_of_inputs_for_fused_prim = 0) const;
     std::shared_ptr<KernelString> GetKernelString(const std::string& kernel_name,
                                                   const std::string& jit,
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
index 077aace30b6..a5449fc2157 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
@@ -557,8 +557,6 @@ JitConstants MakeActivationJitConstants(ActivationFunction activation_function,
     };
 
     std::string macro_def = name + (use_type_parameter ? "(jit_type, input, m, n)" : "(input, m, n)");
-    std::string macro_def_grad = name + (use_type_parameter ? "(jit_type, input_grad, input, m, n)"
-                                                            : "(input_grad, input, m, n)");
     std::string macro_def_params = use_type_parameter ? "(jit_type, input, params)" : "(input, params)";
 
     jitConstants.AddConstant(MakeJitConstant("ACTIVATION_PARAMS" + suffix, "NL_M" + suffix + ", NL_N" + suffix));
@@ -656,25 +654,6 @@ JitConstants MakeActivationJitConstants(ActivationFunction activation_function,
             jitConstants.AddConstant(MakeJitConstant(macro_def, "(pow(input," + m.str() + "))"));
             break;
         }
-        case ActivationFunction::RELU_GRAD:
-            jitConstants.AddConstant(MakeJitConstant(
-                macro_def_grad,
-                ("input_grad"_jit * ternary(input.gt(zero), one, zero)).str()));
-            macro_def_params = use_type_parameter ? "(jit_type, input_grad, input, params)" : "(input_grad, input, params)";
-            break;
-        case ActivationFunction::RELU_NEGATIVE_SLOPE_GRAD: {
-            const JitTerm slope = disable_type_conversion ? "m"_jit : to_type("m"_jit);
-            jitConstants.AddConstant(MakeJitConstant(
-                macro_def_grad,
-                ("input_grad"_jit * (ternary(input.gt(zero), one, zero) + (to_type(slope) * ternary(input.le(zero), one, zero))))
-                    .str()));
-            macro_def_params = use_type_parameter ? "(jit_type, input_grad, input, params)" : "(input_grad, input, params)";
-            break;
-        }
-        case ActivationFunction::NONE_GRAD:
-            jitConstants.AddConstant(MakeJitConstant(macro_def_grad, "input_grad"));
-            macro_def_params = use_type_parameter ? "(jit_type, input_grad, input, params)" : "(input_grad, input, params)";
-            break;
         case ActivationFunction::TAN:
             jitConstants.AddConstant(MakeJitConstant(macro_def, "(tan(input))"));
             break;
@@ -986,23 +965,14 @@ JitConstants MakeActivationJitConstants(std::vector