[GPU] Better extension requirements checks in kernels. Subgroups basic emulation (#13926)
This commit is contained in:
committed by
GitHub
parent
4831a9ead4
commit
13c8b4fdc7
@@ -55,12 +55,15 @@ struct device_info {
|
||||
bool supports_fp16; ///< Does engine support FP16.
|
||||
bool supports_fp64; ///< Does engine support FP64.
|
||||
bool supports_fp16_denorms; ///< Does engine support denormalized FP16.
|
||||
bool supports_subgroups; ///< Does engine support cl_intel_subgroups extension.
|
||||
bool supports_subgroups_short; ///< Does engine support cl_intel_subgroups_short extension.
|
||||
bool supports_subgroups_char; ///< Does engine support cl_intel_subgroups_char extension.
|
||||
bool supports_local_block_io; ///< Does engine support cl_intel_subgroup_local_block_io extension. Check program build with this option.
|
||||
bool supports_khr_subgroups; ///< Does engine support cl_khr_subgroups extension.
|
||||
bool supports_intel_subgroups; ///< Does engine support cl_intel_subgroups extension.
|
||||
bool supports_intel_subgroups_short; ///< Does engine support cl_intel_subgroups_short extension.
|
||||
bool supports_intel_subgroups_char; ///< Does engine support cl_intel_subgroups_char extension.
|
||||
bool supports_intel_required_subgroup_size; ///< Does engine support cl_intel_required_subgroup_size extension.
|
||||
bool supports_local_block_io; ///< Does engine support cl_intel_subgroup_local_block_io extension.
|
||||
bool supports_queue_families; ///< Does engine support cl_intel_command_queue_families extension.
|
||||
bool supports_image; ///< Does engine support images (CL_DEVICE_IMAGE_SUPPORT cap).
|
||||
bool supports_intel_planar_yuv; ///< Does engine support cl_intel_planar_yuv extension.
|
||||
|
||||
bool supports_imad; ///< Does engine support int8 mad.
|
||||
bool supports_immad; ///< Does engine support int8 multi mad.
|
||||
|
||||
@@ -591,6 +591,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (node.get_primitive()->deformable_mode)
|
||||
return false;
|
||||
|
||||
// Since reorder inputs is called after this pass
|
||||
// we have to check that blocked formats can be used in the network and layer is optimized for it.
|
||||
if ((node.get_output_layout().format == format::b_fs_yx_fsv16 ||
|
||||
|
||||
@@ -1016,14 +1016,18 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
|
||||
const auto& device_info = program->get_engine().get_device_info();
|
||||
|
||||
params.uniqueID = std::to_string(param_info.unique_id);
|
||||
params.engineInfo.bSubGroupSupport = device_info.supports_subgroups;
|
||||
params.engineInfo.bSubGroupShortSupport = device_info.supports_subgroups_short;
|
||||
params.engineInfo.bSubGroupCharSupport = device_info.supports_subgroups_char;
|
||||
params.engineInfo.bFP16Support = device_info.supports_fp16;
|
||||
params.engineInfo.bFP64Support = device_info.supports_fp64;
|
||||
params.engineInfo.bIMADSupport = device_info.supports_imad != 0;
|
||||
params.engineInfo.bIMMADSupport = device_info.supports_immad != 0;
|
||||
params.engineInfo.bImageSupport = device_info.supports_image != 0;
|
||||
params.engineInfo.supports_fp16 = device_info.supports_fp16;
|
||||
params.engineInfo.supports_fp64 = device_info.supports_fp64;
|
||||
params.engineInfo.supports_fp16_denorms = device_info.supports_fp16_denorms;
|
||||
params.engineInfo.supports_khr_subgroups = device_info.supports_khr_subgroups;
|
||||
params.engineInfo.supports_intel_subgroups = device_info.supports_intel_subgroups;
|
||||
params.engineInfo.supports_intel_subgroups_short = device_info.supports_intel_subgroups_short;
|
||||
params.engineInfo.supports_intel_subgroups_char = device_info.supports_intel_subgroups_char;
|
||||
params.engineInfo.supports_intel_required_subgroup_size = device_info.supports_intel_required_subgroup_size;
|
||||
|
||||
params.engineInfo.supports_imad = device_info.supports_imad;
|
||||
params.engineInfo.supports_immad = device_info.supports_immad;
|
||||
params.engineInfo.enable_sub_groups_emulation = true;
|
||||
params.engineInfo.bOptHintsSupport = false;
|
||||
|
||||
params.engineInfo.bLocalBlockIOSupport = device_info.supports_local_block_io && program->is_local_block_io_supported();
|
||||
@@ -1038,6 +1042,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
|
||||
params.engineInfo.deviceCache = program->get_tuning_cache();
|
||||
params.engineInfo.driverVersion = device_info.driver_version;
|
||||
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
|
||||
params.engineInfo.vendor_id = device_info.vendor_id;
|
||||
|
||||
auto impl_forcing_bo = program->get_options().get<build_option_type::force_implementations>();
|
||||
const auto& impl_forcing = impl_forcing_bo->forcing;
|
||||
|
||||
@@ -1066,6 +1066,11 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
|
||||
auto input_layout = node.get_dependency(0).get_output_layout();
|
||||
auto output_layout = node.calc_output_layout();
|
||||
|
||||
if (prim->deformable_mode) {
|
||||
output_layout.format = format::adjust_to_rank(format::bfyx, output_layout.get_partial_shape().size());
|
||||
return output_layout;
|
||||
}
|
||||
|
||||
if (input_layout.is_dynamic() || output_layout.is_dynamic()) {
|
||||
if (input_layout.get_partial_shape().size() <= 4)
|
||||
expected_format = format::b_fs_yx_fsv16;
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
KERNEL(activation)(
|
||||
__global INPUT0_TYPE* input,
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#ifdef PARAMETERIZED
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#if MAX_POOLING
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/fetch_utils.cl"
|
||||
|
||||
#ifdef BATCH_AXIS
|
||||
#define VALUES_NUM INPUT0_BATCH_NUM
|
||||
@@ -44,32 +43,6 @@
|
||||
|
||||
#define MINIMUM_NUMBER_FOR_PARTIAL_SORTING 100
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
///////////////////////// Input offset /////////////////////////
|
||||
// Computes the index used to address the `input` buffer for the given coordinates.
// The indexing macro is chosen at compile time from INPUT0_DIMS:
//   - INPUT0_DIMS < 5  : INPUT0_GET_INDEX(b, f, y, x); the z argument is not used
//   - INPUT0_DIMS == 5 : INPUT0_GET_INDEX(b, f, z, y, x)
//   - any other value  : compilation is aborted with an #error
// Callers always pass all five coordinates, whatever the actual rank is.
inline uint FUNC(get_input_offset)(uint b, uint f, uint z, uint y, uint x)
|
||||
{
|
||||
#if INPUT0_DIMS < 5
|
||||
return INPUT0_GET_INDEX(b, f, y, x);  // z is ignored on this path
|
||||
#elif INPUT0_DIMS == 5
|
||||
return INPUT0_GET_INDEX(b, f, z, y, x);
|
||||
#else
|
||||
#error arg_max_min_axis.cl: input format - not supported
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////// Output offset ////////////////////////
|
||||
// Computes the index used to address the `output` / `second_output` buffers
// for the given coordinates.
// The indexing macro is chosen at compile time from OUTPUT_DIMS:
//   - OUTPUT_DIMS < 5  : OUTPUT_GET_INDEX(b, f, y, x); the z argument is not used
//   - OUTPUT_DIMS == 5 : OUTPUT_GET_INDEX(b, f, z, y, x)
//   - any other value  : compilation is aborted with an #error
inline uint FUNC(get_output_offset)(uint b, uint f, uint z, uint y, uint x)
|
||||
{
|
||||
#if OUTPUT_DIMS < 5
|
||||
return OUTPUT_GET_INDEX(b, f, y, x);  // z is ignored on this path
|
||||
#elif OUTPUT_DIMS == 5
|
||||
return OUTPUT_GET_INDEX(b, f, z, y, x);
|
||||
#else
|
||||
#error arg_max_min_axis.cl: output format - not supported
|
||||
#endif
|
||||
}
|
||||
|
||||
KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
,__global OUTPUT_TYPE* output
|
||||
#ifdef SECOND_OUTPUT_EXIST
|
||||
@@ -174,41 +147,41 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
indices[AXIS] = sort_idx;
|
||||
|
||||
iav_type result;
|
||||
result.value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result.value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
result.index = sort_idx;
|
||||
|
||||
for (uint i = 0; i < sort_idx / 8; i++) {
|
||||
uint index_offset = i * 8;
|
||||
indices[AXIS] = index_offset;
|
||||
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
indices[AXIS] = index_offset + 1;
|
||||
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
indices[AXIS] = index_offset + 2;
|
||||
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
indices[AXIS] = index_offset + 3;
|
||||
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
indices[AXIS] = index_offset + 4;
|
||||
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
indices[AXIS] = index_offset + 5;
|
||||
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
indices[AXIS] = index_offset + 6;
|
||||
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
indices[AXIS] = index_offset + 7;
|
||||
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
if (sort_position >= TOP_K)
|
||||
@@ -217,7 +190,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
|
||||
for (uint i = (sort_idx / 8) * 8; i < sort_idx; i++) {
|
||||
indices[AXIS] = i;
|
||||
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
|
||||
sort_position++;
|
||||
}
|
||||
@@ -227,7 +200,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
|
||||
for (uint i = sort_idx + 1; i < VALUES_NUM; i++) {
|
||||
indices[AXIS] = i;
|
||||
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (result.value COMPARE_PARALLEL_SIGN_2 test_value)
|
||||
sort_position++;
|
||||
if (sort_position >= TOP_K)
|
||||
@@ -236,7 +209,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
|
||||
// Using simple sorting for sorting by indices and when TOP_K == 1
|
||||
#elif TOP_K == 1
|
||||
INPUT0_TYPE val = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
INPUT0_TYPE val = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
result[0].index = 0;
|
||||
result[0].value = val;
|
||||
bool already_exist = false;
|
||||
@@ -255,7 +228,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
}
|
||||
|
||||
indices[AXIS] = i;
|
||||
INPUT0_TYPE in_data = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
INPUT0_TYPE in_data = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
if (val COMPARE_SIGN in_data) {
|
||||
result[top_k].index = i;
|
||||
result[top_k].value = in_data;
|
||||
@@ -270,26 +243,26 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
for (uint i = 0; i < VALUES_NUM / 8; i++) {
|
||||
uint index_offset = i * 8;
|
||||
indices[AXIS] = result[index_offset].index = index_offset;
|
||||
result[index_offset].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = result[index_offset + 1].index = index_offset + 1;
|
||||
result[index_offset + 1].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset + 1].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = result[index_offset + 2].index = index_offset + 2;
|
||||
result[index_offset + 2].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset + 2].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = result[index_offset + 3].index = index_offset + 3;
|
||||
result[index_offset + 3].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset + 3].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = result[index_offset + 4].index = index_offset + 4;
|
||||
result[index_offset + 4].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset + 4].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = result[index_offset + 5].index = index_offset + 5;
|
||||
result[index_offset + 5].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset + 5].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = result[index_offset + 6].index = index_offset + 6;
|
||||
result[index_offset + 6].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset + 6].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = result[index_offset + 7].index = index_offset + 7;
|
||||
result[index_offset + 7].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[index_offset + 7].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
}
|
||||
|
||||
for (uint i = (VALUES_NUM / 8) * 8; i < VALUES_NUM; i++) {
|
||||
indices[AXIS] = result[i].index = i;
|
||||
result[i].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
result[i].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
}
|
||||
|
||||
for (uint k = 1; k < VALUES_NUM; k *= 2) {
|
||||
@@ -320,26 +293,26 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
for (uint i = 0; i < VALUES_NUM / 8; i++) {
|
||||
uint index_offset = i * 8;
|
||||
indices[AXIS] = temp_buf[index_offset].index = index_offset;
|
||||
temp_buf[index_offset].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = temp_buf[index_offset + 1].index = index_offset + 1;
|
||||
temp_buf[index_offset + 1].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset + 1].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = temp_buf[index_offset + 2].index = index_offset + 2;
|
||||
temp_buf[index_offset + 2].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset + 2].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = temp_buf[index_offset + 3].index = index_offset + 3;
|
||||
temp_buf[index_offset + 3].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset + 3].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = temp_buf[index_offset + 4].index = index_offset + 4;
|
||||
temp_buf[index_offset + 4].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset + 4].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = temp_buf[index_offset + 5].index = index_offset + 5;
|
||||
temp_buf[index_offset + 5].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset + 5].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = temp_buf[index_offset + 6].index = index_offset + 6;
|
||||
temp_buf[index_offset + 6].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset + 6].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
indices[AXIS] = temp_buf[index_offset + 7].index = index_offset + 7;
|
||||
temp_buf[index_offset + 7].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[index_offset + 7].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
}
|
||||
|
||||
for (uint i = (VALUES_NUM / 8) * 8; i < VALUES_NUM; i++) {
|
||||
indices[AXIS] = temp_buf[i].index = i;
|
||||
temp_buf[i].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
|
||||
temp_buf[i].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
|
||||
}
|
||||
|
||||
for (uint group = 0; group < group_num - 1; group++) {
|
||||
@@ -439,22 +412,22 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
#if SORT_BY_VALUE
|
||||
indices[AXIS] = sort_position;
|
||||
#ifdef TOP_K_ORDER
|
||||
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.value);
|
||||
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.value);
|
||||
#else
|
||||
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.index);
|
||||
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.index);
|
||||
#endif
|
||||
#ifdef SECOND_OUTPUT_EXIST
|
||||
#ifdef MULTIPLE_OUTPUTS
|
||||
#ifdef TOP_K_ORDER
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.index);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.index);
|
||||
#else
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.value);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.value);
|
||||
#endif
|
||||
#else
|
||||
#ifdef TOP_K_ORDER
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.index);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.index);
|
||||
#else
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.value);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.value);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
@@ -472,22 +445,22 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
|
||||
indices[AXIS] = out_position;
|
||||
#ifdef TOP_K_ORDER
|
||||
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].value);
|
||||
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].value);
|
||||
#else
|
||||
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].index);
|
||||
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].index);
|
||||
#endif
|
||||
#ifdef SECOND_OUTPUT_EXIST
|
||||
#ifdef MULTIPLE_OUTPUTS
|
||||
#ifdef TOP_K_ORDER
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].index);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].index);
|
||||
#else
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].value);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].value);
|
||||
#endif
|
||||
#else
|
||||
#ifdef TOP_K_ORDER
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].index);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].index);
|
||||
#else
|
||||
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].value);
|
||||
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].value);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
@@ -504,4 +477,3 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
|
||||
#undef AXIS
|
||||
#undef VALUES_NUM
|
||||
#undef MINIMUM_NUMBER_FOR_PARTIAL_SORTING
|
||||
#undef unroll_for
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
|
||||
#define GLOBAL_SIZE 128
|
||||
#define LOCAL_SIZE GLOBAL_SIZE
|
||||
|
||||
@@ -13,7 +12,7 @@
|
||||
#define INPUT0_FILL_VAL INPUT0_VAL_MIN
|
||||
#else
|
||||
#define COMPARE_SIGN >
|
||||
#define INPUT0_FILL_VAL INPUT0_VAL_MAX
|
||||
#define INPUT0_FILL_VAL INPUT0_VAL_MAX
|
||||
#endif
|
||||
|
||||
__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
|
||||
@@ -39,8 +38,7 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
|
||||
|
||||
uint temp_index = global_index;
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < TOP_K; i++){
|
||||
unroll_for(uint i = 0; i < TOP_K; i++){
|
||||
accumulator.index = global_index;
|
||||
accumulator.value = input[global_index];
|
||||
for (int j = 0; j < i; j++){
|
||||
@@ -49,10 +47,10 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
|
||||
}
|
||||
global_index += GLOBAL_SIZE;
|
||||
#ifdef INPUT0_LAYOUT_BFYX
|
||||
while (global_index < size + batch_offset)
|
||||
while (global_index < size + batch_offset)
|
||||
#else
|
||||
while (global_index < size)
|
||||
#endif
|
||||
#endif
|
||||
{
|
||||
iav_type element;
|
||||
element.value = input[global_index];
|
||||
@@ -72,7 +70,7 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
|
||||
global_index += GLOBAL_SIZE * INPUT0_BATCH_NUM;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#ifdef INPUT0_LAYOUT_BFYX
|
||||
if (local_index < size)
|
||||
scratch[local_index] = accumulator;
|
||||
@@ -84,14 +82,13 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
|
||||
else
|
||||
scratch[local_index].value = INPUT0_FILL_VAL;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
|
||||
unroll_for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
|
||||
{
|
||||
if (local_index < offset)
|
||||
if (local_index < offset)
|
||||
{
|
||||
iav_type other = scratch[local_index + offset];
|
||||
iav_type mine = scratch[local_index];
|
||||
@@ -103,16 +100,16 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
|
||||
#ifdef INPUT0_LAYOUT_BFYX
|
||||
if (local_index == 0)
|
||||
if (local_index == 0)
|
||||
{
|
||||
output[current_batch * TOP_K + i] = scratch[0].index % size;
|
||||
}
|
||||
global_index = temp_index;
|
||||
results[i] = scratch[0].index % size;
|
||||
#else
|
||||
if (local_index == 0)
|
||||
if (local_index == 0)
|
||||
{
|
||||
output[current_batch + i*INPUT0_BATCH_NUM] = scratch[0].index / INPUT0_BATCH_NUM;
|
||||
}
|
||||
@@ -123,4 +120,4 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
|
||||
}
|
||||
|
||||
#undef COMPARE_SIGN
|
||||
#undef INPUT0_FILL_VAL
|
||||
#undef INPUT0_FILL_VAL
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
#ifndef SG_SIZE
|
||||
#define SG_SIZE 16
|
||||
@@ -36,7 +35,7 @@
|
||||
#endif
|
||||
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SG_SIZE)
|
||||
__attribute__((reqd_work_group_size(SG_SIZE, 1, 1)))
|
||||
KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
{
|
||||
@@ -56,8 +55,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
// (gid + 1) <= input_size / (INB_ARRAY_SIZE * SG_SIZE) -> as gid is integral, the floor is not an issue
|
||||
if (gid + 1 <= input_size / (INB_ARRAY_SIZE * SG_SIZE))
|
||||
{
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ai = 0; ai < INB_ARRAY_SIZE; ++ai)
|
||||
unroll_for(uint ai = 0; ai < INB_ARRAY_SIZE; ++ai)
|
||||
{
|
||||
// Can be exchanged with sub-group block read to INB_ARRAY_SIZE-component vector.
|
||||
input_blocks[ai] = input[gid * INB_ARRAY_SIZE * SG_SIZE + ai * SG_SIZE + lid];
|
||||
@@ -69,8 +67,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
const uint last_gid = input_size / (INB_ARRAY_SIZE * SG_SIZE);
|
||||
|
||||
uint ai = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint last_base_off = last_gid * INB_ARRAY_SIZE * SG_SIZE; last_base_off + SG_SIZE <= input_size; last_base_off += SG_SIZE)
|
||||
unroll_for(uint last_base_off = last_gid * INB_ARRAY_SIZE * SG_SIZE; last_base_off + SG_SIZE <= input_size; last_base_off += SG_SIZE)
|
||||
{
|
||||
// Can be exchanged with sub-group block read to scalar.
|
||||
input_blocks[ai] = input[last_base_off + lid];
|
||||
@@ -85,8 +82,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
indices[ai++] = lid < input_size - remainder_off ? remainder_off + lid : 0;
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ai < INB_ARRAY_SIZE; ++ai)
|
||||
unroll_for(; ai < INB_ARRAY_SIZE; ++ai)
|
||||
{
|
||||
input_blocks[ai] = UNIT_FILL_VAL;
|
||||
}
|
||||
@@ -98,8 +94,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
UNIT_TYPE acc[minmax_acc_array_size];
|
||||
uint result[minmax_acc_array_size];
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
|
||||
unroll_for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
|
||||
{
|
||||
acc[ai] = UNIT_FILL_VAL;
|
||||
result[ai] = 0;
|
||||
@@ -109,24 +104,22 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint ii = 0; ii < INB_ARRAY_SIZE * SG_SIZE; ++ii)
|
||||
{
|
||||
UNIT_TYPE in_val = intel_sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
|
||||
uint in_index = intel_sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
|
||||
UNIT_TYPE in_val = _sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
|
||||
uint in_index = _sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
|
||||
unroll_for(uint ai = 0; ai < minmax_acc_array_size; ++ai)
|
||||
{
|
||||
bool insert_flag = (in_val OP_ARG_REL acc[ai]);
|
||||
if (sub_group_any(insert_flag))
|
||||
{
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint aj = minmax_acc_array_size; aj > ai + 1; --aj)
|
||||
unroll_for(uint aj = minmax_acc_array_size; aj > ai + 1; --aj)
|
||||
{
|
||||
acc[aj - 1] = intel_sub_group_shuffle_up(acc[aj - 2], acc[aj - 1], 1);
|
||||
result[aj - 1] = intel_sub_group_shuffle_up(result[aj - 2], acc[aj - 1], 1);
|
||||
acc[aj - 1] = _sub_group_shuffle_up(acc[aj - 2], acc[aj - 1], 1);
|
||||
result[aj - 1] = _sub_group_shuffle_up(result[aj - 2], acc[aj - 1], 1);
|
||||
}
|
||||
UNIT_TYPE in_val_acc_mask = select(in_val, acc[ai], insert_flag);
|
||||
uint in_index_mask = select(in_index, result[ai], insert_flag);
|
||||
acc[ai] = select(acc[ai], intel_sub_group_shuffle_up(in_val, in_val_acc_mask, 1), insert_flag);
|
||||
result[ai] = select(result[ai], intel_sub_group_shuffle_up(in_index, in_index_mask, 1), insert_flag);
|
||||
acc[ai] = select(acc[ai], _sub_group_shuffle_up(in_val, in_val_acc_mask, 1), insert_flag);
|
||||
result[ai] = select(result[ai], _sub_group_shuffle_up(in_index, in_index_mask, 1), insert_flag);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -135,8 +128,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
|
||||
// Write TOP_K sorted results.
|
||||
uint ai = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint k_base_off = 0; k_base_off + SG_SIZE <= TOP_K; k_base_off += SG_SIZE)
|
||||
unroll_for (uint k_base_off = 0; k_base_off + SG_SIZE <= TOP_K; k_base_off += SG_SIZE)
|
||||
{
|
||||
output[k_base_off + lid] = result[ai++] % input_size;
|
||||
}
|
||||
@@ -161,4 +153,4 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
|
||||
#undef UNIT_FILL_VAL
|
||||
#undef UNIT_FILL_VAL_NEEDSUNDEF_
|
||||
#endif
|
||||
#undef OP_ARG_REL
|
||||
#undef OP_ARG_REL
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(average_unpooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(batch_to_space_ref)(const __global INPUT0_TYPE* input,
|
||||
|
||||
@@ -2,17 +2,19 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define OC_BLOCK_SIZE 32
|
||||
|
||||
#define GET_WEI(data, id) intel_sub_group_shuffle(data, id)
|
||||
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define GET_WEI(data, id) _sub_group_shuffle(data, id)
|
||||
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) _sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
|
||||
KERNEL(binary_convolution_1x1)(const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
|
||||
@@ -2,17 +2,19 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/unit_type.cl"
|
||||
|
||||
#define OC_BLOCK_SIZE 16
|
||||
|
||||
#define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
|
||||
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define GET_SRC(data, id) _sub_group_shuffle(data, id)
|
||||
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
|
||||
KERNEL(binary_convolution_1x1_b_fs_yx_fsv16)(const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
|
||||
@@ -2,13 +2,14 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define OC_BLOCK_SIZE 32
|
||||
|
||||
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
|
||||
|
||||
#if BINARY_PACKED_OUTPUT
|
||||
#define BUFFER_TYPE UNIT_TYPE
|
||||
@@ -16,7 +17,7 @@
|
||||
#define BUFFER_TYPE OUTPUT_TYPE
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
|
||||
KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@@ -107,7 +108,7 @@ KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
|
||||
__attribute__((opencl_unroll_hint(SUB_GROUP_SIZE)))
|
||||
for (int i = 0; i < SUB_GROUP_SIZE; i++)
|
||||
{
|
||||
INPUT0_TYPE src = intel_sub_group_shuffle(line_cache[(kw + i*STRIDE_SIZE_X) / SUB_GROUP_SIZE],
|
||||
INPUT0_TYPE src = _sub_group_shuffle(line_cache[(kw + i*STRIDE_SIZE_X) / SUB_GROUP_SIZE],
|
||||
(kw + i*STRIDE_SIZE_X) % SUB_GROUP_SIZE);
|
||||
#if EXCLUDE_PAD
|
||||
int compute = ((input_x + kw + i*STRIDE_SIZE_X >= 0) &&
|
||||
@@ -149,7 +150,7 @@ KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
|
||||
for (int i = 0; i < SUB_GROUP_SIZE*2; i++)
|
||||
{
|
||||
#if EXCLUDE_PAD
|
||||
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*intel_sub_group_shuffle(real_ks, i%SUB_GROUP_SIZE) - 2*dst_buf[i]);
|
||||
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*_sub_group_shuffle(real_ks, i%SUB_GROUP_SIZE) - 2*dst_buf[i]);
|
||||
#else
|
||||
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*FILTER_SIZE_Y*FILTER_SIZE_X - 2*dst_buf[i]);
|
||||
#endif
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(binary_convolution_ref)(const __global INPUT0_TYPE* input,
|
||||
|
||||
@@ -2,34 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
// Returns INPUT0_GET_INDEX(...) for the given coordinates, choosing the
// argument list at compile time from INPUT0_DIMS:
//   INPUT0_DIMS < 5  -> (b, f, y, x)        ; w and z are ignored
//   INPUT0_DIMS == 5 -> (b, f, z, y, x)     ; w is ignored
//   INPUT0_DIMS == 6 -> (b, f, w, z, y, x)
// Any other INPUT0_DIMS value stops the build with #error.
// NOTE(review): INPUT0_GET_INDEX is defined outside this view; presumably it
// yields the linear element offset into the input buffer - confirm.
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if INPUT0_DIMS < 5
|
||||
return INPUT0_GET_INDEX(b, f, y, x);
|
||||
#elif INPUT0_DIMS == 5
|
||||
return INPUT0_GET_INDEX(b, f, z, y, x);
|
||||
#elif INPUT0_DIMS == 6
|
||||
return INPUT0_GET_INDEX(b, f, w, z, y, x);
|
||||
#else
|
||||
#error [clDNN border_gpu_ref.cl]: input format - not supported
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns OUTPUT_GET_INDEX(...) for the given coordinates, choosing the
// argument list at compile time from OUTPUT_DIMS:
//   OUTPUT_DIMS < 5  -> (b, f, y, x)        ; w and z are ignored
//   OUTPUT_DIMS == 5 -> (b, f, z, y, x)     ; w is ignored
//   OUTPUT_DIMS == 6 -> (b, f, w, z, y, x)
// Any other OUTPUT_DIMS value stops the build with #error.
// NOTE(review): OUTPUT_GET_INDEX is defined outside this view; presumably it
// yields the linear element offset into the output buffer - confirm.
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if OUTPUT_DIMS < 5
|
||||
return OUTPUT_GET_INDEX(b, f, y, x);
|
||||
#elif OUTPUT_DIMS == 5
|
||||
return OUTPUT_GET_INDEX(b, f, z, y, x);
|
||||
#elif OUTPUT_DIMS == 6
|
||||
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
|
||||
#else
|
||||
#error [clDNN border_gpu_ref.cl]: output format - not supported
|
||||
#endif
|
||||
}
|
||||
#include "include/fetch_utils.cl"
|
||||
|
||||
KERNEL(border_gpu_ref)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/fetch_utils.cl"
|
||||
|
||||
#define GET_UPDATES_INDEX(prefix, idx_order) CAT(prefix, _GET_INDEX)(idx_order)
|
||||
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
|
||||
#define WORK_GROUP_SIZE 16
|
||||
#define IC_BLOCK 16
|
||||
@@ -21,10 +22,8 @@
|
||||
# define TILE_F 1
|
||||
#endif
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
__attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(WORK_GROUP_SIZE)
|
||||
KERNEL (concatenation_gpu_blocked)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@@ -52,8 +51,7 @@ KERNEL (concatenation_gpu_blocked)(
|
||||
OUTPUT_BLOCK_WRITE(output, dst_index, res);
|
||||
} else {
|
||||
if (lid < INPUT0_FEATURE_NUM % IC_BLOCK) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(((INPUT0_TYPE*)&src)[tx], ACTIVATION_PARAMS));
|
||||
output[dst_index + tx * IC_BLOCK + lid] = res;
|
||||
}
|
||||
@@ -78,12 +76,11 @@ KERNEL (concatenation_gpu_blocked)(
|
||||
INPUT_VEC_TYPE src_al1 = 0;
|
||||
INPUT_VEC_TYPE src_al2 = 0;
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
((INPUT0_TYPE*)&src_al0)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src0)[tx], ((INPUT0_TYPE*)&src1)[tx], (IC_BLOCK - MISALIGNMENT));
|
||||
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
((INPUT0_TYPE*)&src_al0)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src0)[tx], ((INPUT0_TYPE*)&src1)[tx], (IC_BLOCK - MISALIGNMENT));
|
||||
#if TILE_F == 4
|
||||
((INPUT0_TYPE*)&src_al1)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src1)[tx], ((INPUT0_TYPE*)&src2)[tx], (IC_BLOCK - MISALIGNMENT));
|
||||
((INPUT0_TYPE*)&src_al2)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src2)[tx], ((INPUT0_TYPE*)&src3)[tx], (IC_BLOCK - MISALIGNMENT));
|
||||
((INPUT0_TYPE*)&src_al1)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src1)[tx], ((INPUT0_TYPE*)&src2)[tx], (IC_BLOCK - MISALIGNMENT));
|
||||
((INPUT0_TYPE*)&src_al2)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src2)[tx], ((INPUT0_TYPE*)&src3)[tx], (IC_BLOCK - MISALIGNMENT));
|
||||
#endif
|
||||
}
|
||||
OUTPUT_VEC_TYPE res_al0 = TO_OUTPUT_VEC_TYPE(ACTIVATION(src_al0, ACTIVATION_PARAMS));
|
||||
@@ -105,8 +102,7 @@ KERNEL (concatenation_gpu_blocked)(
|
||||
#endif
|
||||
|
||||
dst_index = OUTPUT_GET_INDEX(b, (f_block*IC_BLOCK + lid_f_offset + output_offset_in_concat_axis), y, x);
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
OUTPUT_TYPE res_unal = TO_OUTPUT_TYPE(ACTIVATION(((INPUT0_TYPE*)&src_unal)[tx], ACTIVATION_PARAMS));
|
||||
output[dst_index + tx * IC_BLOCK] = res_unal;
|
||||
}
|
||||
@@ -115,15 +111,13 @@ KERNEL (concatenation_gpu_blocked)(
|
||||
{
|
||||
const uint dst_index = OUTPUT_GET_INDEX(b, (f_block*IC_BLOCK + lid + output_offset_in_concat_axis), y, x);
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < TILE_F; ++fw) {
|
||||
unroll_for(uint fw = 0; fw < TILE_F; ++fw) {
|
||||
if (TILE_F != 1 && CEIL_DIV(INPUT0_FEATURE_NUM, IC_BLOCK) % TILE_F != 0 && CEIL_DIV(INPUT0_FEATURE_NUM, IC_BLOCK) % TILE_F == fw)
|
||||
break;
|
||||
|
||||
bool do_leftover_write = INPUT0_FEATURE_NUM % IC_BLOCK == 0 || f_block * IC_BLOCK + fw * IC_BLOCK + lid < INPUT0_FEATURE_NUM;
|
||||
if (do_leftover_write) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
|
||||
INPUT0_TYPE src = input[input_offset + lid + tx * IC_BLOCK + fw * INPUT0_FEATURE_PITCH * IC_BLOCK];
|
||||
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(src, ACTIVATION_PARAMS));
|
||||
output[dst_index + tx * IC_BLOCK + fw * OUTPUT_FEATURE_PITCH * IC_BLOCK] = res;
|
||||
@@ -144,4 +138,3 @@ KERNEL (concatenation_gpu_blocked)(
|
||||
#undef OUTPUT_BLOCK_WRITE
|
||||
|
||||
#undef TILE_F
|
||||
#undef CEIL_DIV
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
//
|
||||
@@ -16,17 +17,9 @@
|
||||
#define WORK_GROUP_SIZE 16
|
||||
#define INPUT0_ELEMENTS_COUNT (INPUT0_LENGTH/INPUT0_BATCH_NUM)
|
||||
|
||||
#if FP16_UNIT_USED
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (byte_offset), as_ushort8(val))
|
||||
#else
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (byte_offset), as_uint8(val))
|
||||
#endif
|
||||
|
||||
__attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
|
||||
KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __global UNIT_TYPE* output, uint output_offset_in_concat_axis)
|
||||
REQD_SUB_GROUP_SIZE(WORK_GROUP_SIZE)
|
||||
KERNEL(concatenation_gpu_depth_bfyx_no_pitch)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
|
||||
{
|
||||
const uint batch_id = get_group_id(0);
|
||||
|
||||
@@ -41,7 +34,7 @@ KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __gl
|
||||
const uint output_offset = OUTPUT_OFFSET + element_group_offset + output_batch_offset + output_offset_in_concat_axis*OUTPUT_PITCHES[CONCAT_AXIS_INDEX];
|
||||
|
||||
//Check if current group in batch starts from 16-byte aligned pos. If not then move block read to 16-byte aligned position.
|
||||
//Requirement for intel_sub_group_block_write8.
|
||||
//Requirement for _sub_group_block_write8.
|
||||
uint align_offset = 0;
|
||||
const uint group_start_pos = output_offset;
|
||||
if(group_start_pos % WORK_GROUP_SIZE != 0)
|
||||
@@ -52,8 +45,8 @@ KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __gl
|
||||
|
||||
if(element_group_offset + align_offset + WORK_GROUP_SIZE * ELEMENTS_PER_WORK_ITEM < INPUT0_ELEMENTS_COUNT)
|
||||
{
|
||||
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) in = ALIGNED_BLOCK_READ8(input, input_offset + align_offset);
|
||||
ALIGNED_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, ACTIVATION_PARAMS));
|
||||
MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) in = DT_INPUT_BLOCK_READ8(input, input_offset + align_offset);
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, ACTIVATION_PARAMS));
|
||||
|
||||
//Fill the values that were missed upon adding align_offset
|
||||
if((align_offset != 0) && (element_offset + output_batch_offset < group_start_pos + align_offset))
|
||||
|
||||
@@ -2,12 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/unit_type.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
|
||||
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
|
||||
|
||||
@@ -23,10 +20,10 @@
|
||||
// must be equal FSV / SUB_GROUP_SIZE
|
||||
// ======================================================================================
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
|
||||
__global UNIT_TYPE* output,
|
||||
KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
uint output_offset_in_concat_axis
|
||||
)
|
||||
{
|
||||
@@ -44,12 +41,12 @@ KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
|
||||
input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
|
||||
input_offset += fs * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV * INPUT0_BATCH_NUM;
|
||||
|
||||
UNIT_TYPE2 in = UNIT_BLOCK_READ2(input, input_offset);
|
||||
MAKE_VECTOR_TYPE(INPUT0_TYPE, 2) in = DT_INPUT_BLOCK_READ2(input, input_offset);
|
||||
|
||||
in = ACTIVATION(in, ACTIVATION_PARAMS);
|
||||
#if ALIGNED
|
||||
const uint dst_index = OUTPUT_GET_INDEX(b, output_offset_in_concat_axis + fs * FSV, y, x);
|
||||
UNIT_BLOCK_WRITE2(output, dst_index, in);
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, in);
|
||||
#else
|
||||
const uint dst_feature = fs * FSV + output_offset_in_concat_axis + sglid;
|
||||
if (dst_feature + SUB_GROUP_SIZE < OUTPUT_FEATURE_NUM) {
|
||||
@@ -63,8 +60,6 @@ KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
|
||||
#undef INPUT0_SIZE_X_WITH_PADDING
|
||||
#undef INPUT0_SIZE_Y_WITH_PADDING
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define GET_INDEX(prefix, ORDER) CAT(prefix, _GET_INDEX)(ORDER)
|
||||
|
||||
@@ -2,53 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
///////////////////////// Input Index /////////////////////////
|
||||
// Returns the INPUT0 index for the given coordinates, picking the indexing
// macro at compile time. The first matching condition wins:
//   INPUT0_SIMPLE && INPUT0_DIMS <= 4    -> GET_DATA_INDEX(INPUT0, b, f, y, x)
//   INPUT0_SIMPLE && INPUT0_DIMS == 5    -> GET_DATA_INDEX_5D(INPUT0, b, f, z, y, x)
//   INPUT0_SIMPLE && INPUT0_DIMS == 6    -> GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x)
//   INPUT0_LAYOUT_B_FS_ZYX_FSV16         -> GET_DATA_B_FS_ZYX_FSV16_INDEX(INPUT0, b, f, z, y, x)
//   INPUT0_LAYOUT_BS_FS_ZYX_BSV16_FSV16  -> GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(INPUT0, b, f, z, y, x)
//   INPUT0_LAYOUT_BS_FS_YX_BSV16_FSV16   -> GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, y, x)
//   INPUT0_LAYOUT_BS_FS_YX_BSV32_FSV32   -> GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(INPUT0, b, f, y, x)
// If none match, the build stops with #error.
// Only the 6D simple branch uses w; the 4D simple and the two *_YX_* blocked
// branches also ignore z.
// NOTE(review): the GET_DATA_* macros are defined outside this view;
// presumably each yields a linear element offset - confirm.
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if INPUT0_SIMPLE && INPUT0_DIMS <= 4
|
||||
return GET_DATA_INDEX(INPUT0, b, f, y, x);
|
||||
#elif INPUT0_SIMPLE && INPUT0_DIMS == 5
|
||||
return GET_DATA_INDEX_5D(INPUT0, b, f, z, y, x);
|
||||
#elif INPUT0_SIMPLE && INPUT0_DIMS == 6
|
||||
return GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x);
|
||||
#elif INPUT0_LAYOUT_B_FS_ZYX_FSV16
|
||||
return GET_DATA_B_FS_ZYX_FSV16_INDEX(INPUT0, b, f, z, y, x);
|
||||
#elif INPUT0_LAYOUT_BS_FS_ZYX_BSV16_FSV16
|
||||
return GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(INPUT0, b, f, z, y, x);
|
||||
#elif INPUT0_LAYOUT_BS_FS_YX_BSV16_FSV16
|
||||
return GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, y, x);
|
||||
#elif INPUT0_LAYOUT_BS_FS_YX_BSV32_FSV32
|
||||
return GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(INPUT0, b, f, y, x);
|
||||
#else
|
||||
#error concatenation_gpu_simple_ref.cl: input format - not supported
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////// Output Index /////////////////////////
|
||||
// Returns the OUTPUT index for the given coordinates, picking the indexing
// macro at compile time. The first matching condition wins:
//   OUTPUT_SIMPLE && OUTPUT_DIMS <= 4    -> GET_DATA_INDEX(OUTPUT, b, f, y, x)
//   OUTPUT_SIMPLE && OUTPUT_DIMS == 5    -> GET_DATA_INDEX_5D(OUTPUT, b, f, z, y, x)
//   OUTPUT_SIMPLE && OUTPUT_DIMS == 6    -> GET_DATA_INDEX_6D(OUTPUT, b, f, w, z, y, x)
//   OUTPUT_LAYOUT_B_FS_ZYX_FSV16         -> GET_DATA_B_FS_ZYX_FSV16_INDEX(OUTPUT, b, f, z, y, x)
//   OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16  -> GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(OUTPUT, b, f, z, y, x)
//   OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16   -> GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, b, f, y, x)
//   OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32   -> GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(OUTPUT, b, f, y, x)
// If none match, the build stops with #error.
// Only the 6D simple branch uses w; the 4D simple and the two *_YX_* blocked
// branches also ignore z.
// NOTE(review): the GET_DATA_* macros are defined outside this view;
// presumably each yields a linear element offset - confirm.
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if OUTPUT_SIMPLE && OUTPUT_DIMS <= 4
|
||||
return GET_DATA_INDEX(OUTPUT, b, f, y, x);
|
||||
#elif OUTPUT_SIMPLE && OUTPUT_DIMS == 5
|
||||
return GET_DATA_INDEX_5D(OUTPUT, b, f, z, y, x);
|
||||
#elif OUTPUT_SIMPLE && OUTPUT_DIMS == 6
|
||||
return GET_DATA_INDEX_6D(OUTPUT, b, f, w, z, y, x);
|
||||
#elif OUTPUT_LAYOUT_B_FS_ZYX_FSV16
|
||||
return GET_DATA_B_FS_ZYX_FSV16_INDEX(OUTPUT, b, f, z, y, x);
|
||||
#elif OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16
|
||||
return GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(OUTPUT, b, f, z, y, x);
|
||||
#elif OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16
|
||||
return GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, b, f, y, x);
|
||||
#elif OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32
|
||||
return GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(OUTPUT, b, f, y, x);
|
||||
#else
|
||||
#error concatenation_gpu_simple_ref.cl: output format - not supported
|
||||
#endif
|
||||
}
|
||||
|
||||
#include "include/fetch_utils.cl"
|
||||
|
||||
KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
|
||||
{
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
#if defined(CONVERT_FROM_NV12) || defined(CONVERT_FROM_I420)
|
||||
#ifdef BUFFER_MEM
|
||||
|
||||
@@ -4,8 +4,9 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
#define TYPE_N_(type, n) type##n
|
||||
#define TYPE_N(type, n) TYPE_N_(type, n)
|
||||
@@ -60,13 +61,10 @@
|
||||
|
||||
#endif
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
|
||||
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
|
||||
|
||||
#define FSV 16
|
||||
#define SIMD 16
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
|
||||
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
const __global INPUT0_TYPE *conv_input,
|
||||
@@ -102,8 +100,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
|
||||
const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
|
||||
uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
unroll_for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
|
||||
uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
|
||||
? out_yx_shuffle
|
||||
@@ -136,8 +133,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
uint input_y[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
unroll_for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
#ifdef SHOULD_USE_DATA_ZP
|
||||
input_x[os] = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
|
||||
input_y[os] = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
|
||||
@@ -158,18 +154,15 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
|
||||
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
|
||||
uint4 weights_zp_val[OUT_BLOCK_FEATURES];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
|
||||
}
|
||||
#if INPUT0_FEATURE_NUM % FSV != 0
|
||||
uint4 weights_zp_vec_partial[OUT_BLOCK_FEATURES];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
|
||||
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
|
||||
unroll_for(uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
|
||||
wzp_p[f] = 0;
|
||||
}
|
||||
}
|
||||
@@ -181,8 +174,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
|
||||
#if INPUT0_FEATURE_NUM % FSV != 0
|
||||
if (feature_offset + (k + 1) * FSV >= ALIGN(INPUT0_FEATURE_NUM, FSV)) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
|
||||
}
|
||||
}
|
||||
@@ -199,11 +191,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
|
||||
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
|
||||
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OUT_BLOCK_FEATURES];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
dotProdAZPxWZP[ofb] = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ive = 0; ive < 4; ive++) {
|
||||
unroll_for(uint ive = 0; ive < 4; ive++) {
|
||||
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
|
||||
IMAD(dotProdAZPxWZP[ofb][ive],
|
||||
AS_INPUT0_TYPE_4(data_zp_val[ive]),
|
||||
@@ -213,14 +203,12 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
#endif
|
||||
|
||||
uint4 weights_val[OUT_BLOCK_FEATURES] = { };
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
|
||||
}
|
||||
|
||||
uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
#if defined ASYMMETRIC_DATA_QUANTIZATION && defined NON_ZERO_INPUT0_PAD_BEFORE
|
||||
if (((input_x[os] < 0) || (input_x[os] >= INPUT0_SIZE_X)) ||
|
||||
((input_y[os] < 0) || (input_y[os] >= INPUT0_SIZE_Y))) {
|
||||
@@ -236,12 +224,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
|
||||
// For some cases compiler spills here due to loop order
|
||||
// Use suboptimal order to avoid this at cost of instruction dispatch delays.
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ive = 0; ive < 4; ++ive) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint ive = 0; ive < 4; ++ive) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
#ifdef SHOULD_USE_DATA_ZP
|
||||
ACCUMULATOR_TYPE dotProdAZPxW = 0;
|
||||
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
|
||||
@@ -250,10 +235,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
|
||||
#endif
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ive = 0; ive < 4; ++ive) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint ive = 0; ive < 4; ++ive) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
#ifdef SHOULD_USE_DATA_ZP
|
||||
ACCUMULATOR_TYPE dotProdAZPxW = 0;
|
||||
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
|
||||
@@ -261,10 +244,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
AS_INPUT0_TYPE_4(data_zp_val[ive]),
|
||||
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
#endif
|
||||
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));
|
||||
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));
|
||||
|
||||
dotProd[ofb][os] = IMAD(dotProd[ofb][os],
|
||||
inputs,
|
||||
@@ -293,8 +275,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
}
|
||||
|
||||
filter_idx += WEIGHTS_IS_PITCH;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
|
||||
}
|
||||
|
||||
@@ -317,27 +298,21 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
__local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;
|
||||
|
||||
if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
|
||||
unroll_for(uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
|
||||
if (get_sub_group_id() == wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < wg; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint ofb = 0; ofb < wg; ++ofb) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_SPATIAL * SIMD +
|
||||
os * SIMD;
|
||||
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
|
||||
}
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
dotProd[0][os] = dotProd[wg][os];
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
|
||||
ofb * OUT_BLOCK_SPATIAL * SIMD +
|
||||
@@ -348,10 +323,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_SPATIAL * SIMD +
|
||||
os * SIMD;
|
||||
@@ -366,10 +339,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
return;
|
||||
|
||||
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
|
||||
os * SIMD;
|
||||
@@ -399,18 +370,15 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
|
||||
#ifdef COMPENSATION_TERM
|
||||
COMPENSATION_TYPE comp[FINAL_OUT_BLOCK_FEATURES];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
comp[ofb] = compensation[out_f + ofb * SIMD];
|
||||
}
|
||||
#endif
|
||||
|
||||
// Convert accumulator type to activation type
|
||||
ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);
|
||||
|
||||
#if BIAS_TERM
|
||||
@@ -424,13 +392,11 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
|
||||
// Fused ops/activation
|
||||
OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
FUSED_OPS_PRELOAD_SCALAR;
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
#if HAS_FUSED_OPS
|
||||
#if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
FUSED_OPS_CALC_SCALAR;
|
||||
@@ -462,10 +428,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
if (can_use_full_block_write) {
|
||||
uint output_idx = OUTPUT_GET_INDEX(out_b,
|
||||
out_fg,
|
||||
intel_sub_group_shuffle(out_y_shuffle[0], 0),
|
||||
intel_sub_group_shuffle(out_x_shuffle[0], 0));
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
_sub_group_shuffle(out_y_shuffle[0], 0),
|
||||
_sub_group_shuffle(out_x_shuffle[0], 0));
|
||||
unroll_for(uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|
||||
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|
||||
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
|
||||
@@ -474,8 +439,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 8; ++i) {
|
||||
unroll_for(uint i = 0; i < 8; ++i) {
|
||||
result_val[i] = result[ofb][os + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
|
||||
@@ -485,8 +449,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
#if OUTPUT_TYPE_SIZE <= 2
|
||||
for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 4; ++i) {
|
||||
unroll_for(uint i = 0; i < 4; ++i) {
|
||||
result_val[i] = result[ofb][os + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
|
||||
@@ -495,8 +458,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
#endif
|
||||
for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 2; ++i) {
|
||||
unroll_for(uint i = 0; i < 2; ++i) {
|
||||
result_val[i] = result[ofb][os + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
|
||||
@@ -512,23 +474,20 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
}
|
||||
} else {
|
||||
uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
|
||||
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|
||||
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|
||||
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
|
||||
if (good_of_block) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
|
||||
bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
|
||||
if (!good_os)
|
||||
break;
|
||||
|
||||
uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
|
||||
uint output_idx = _sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
|
||||
bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);
|
||||
|
||||
if (!good_of)
|
||||
@@ -538,8 +497,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
|
||||
output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
|
||||
}
|
||||
}
|
||||
@@ -582,8 +540,5 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
|
||||
|
||||
#undef AS_FILTER_TYPE_4
|
||||
|
||||
#undef CEIL_DIV
|
||||
#undef ALIGN
|
||||
|
||||
#undef SIMD
|
||||
#undef FSV
|
||||
|
||||
@@ -4,8 +4,7 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
|
||||
// ======================================================================================
|
||||
// Host side jit-constants:
|
||||
@@ -23,8 +22,6 @@
|
||||
// data prefetching; requires additional global barrier
|
||||
// ======================================================================================
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
#define FSV 4
|
||||
#define WEIGHTS_OSV 16
|
||||
|
||||
@@ -61,7 +58,7 @@
|
||||
// WI: 1 x FEATURES_PER_WI x 1
|
||||
// SG: 1 x FEATURES_PER_WI x SIMD
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
__attribute__((reqd_work_group_size(SIMD, 1, LWG_DEPTH)))
|
||||
KERNEL(convolution)(
|
||||
const __global uint *input,
|
||||
@@ -134,7 +131,7 @@ KERNEL(convolution)(
|
||||
weights_offset += WEIGHTS_IS_PITCH / FSV * LWG_DEPTH;
|
||||
|
||||
unroll_for (uint out_fi = 0; out_fi < FEATURES_PER_WI; ++out_fi) {
|
||||
int wei_i = intel_sub_group_shuffle(wei_sg[out_fi / SIMD], out_fi % SIMD);
|
||||
int wei_i = _sub_group_shuffle(wei_sg[out_fi / SIMD], out_fi % SIMD);
|
||||
FILTER_TYPE4 wei_val = AS_FILTER_TYPE4(wei_i);
|
||||
|
||||
dotProd[out_fi] = IMAD(dotProd[out_fi], in_val, wei_val);
|
||||
@@ -223,8 +220,6 @@ KERNEL(convolution)(
|
||||
}
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
|
||||
#undef FSV
|
||||
#undef WEIGHTS_OSV
|
||||
|
||||
|
||||
@@ -2,10 +2,10 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
// ======================================================================================
|
||||
// Host side jit-constants:
|
||||
@@ -51,7 +51,6 @@
|
||||
#define WEIGHTS_YXS_PITCH 4
|
||||
|
||||
#define FILTER_SPATIAL_SIZE (FILTER_SIZE_X * FILTER_SIZE_Y)
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
#if FILTER_BLOCKED < FILTER_SPATIAL_SIZE && FILTER_BLOCKED % 4 != 0
|
||||
# error convolution_gpu_b_fs_yx_fsv4_dw.cl - filter blocks must either cover whole spatial filter or be multiple of 4.
|
||||
@@ -76,9 +75,9 @@
|
||||
#endif
|
||||
|
||||
#if TILED
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
#endif
|
||||
KERNEL(convolution)(
|
||||
KERNEL(convolution_gpu_b_fs_yx_fsv4_dw)(
|
||||
const __global INPUT_TYPE4 *input,
|
||||
__global OUTPUT_TYPE4 *output,
|
||||
const __global FILTER_TYPE4 *weights,
|
||||
@@ -114,11 +113,9 @@ KERNEL(convolution)(
|
||||
#if PRELOAD_INPUT || TILED
|
||||
INPUT_TYPE4 in[FILTER_SIZE_Y * INPUT_LINE_SIZE];
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) {
|
||||
unroll_for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) {
|
||||
// TODO Try to avoid loading last input line in padded situations
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
|
||||
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
|
||||
uint preload_offset = yi * INPUT_LINE_SIZE + xi;
|
||||
uint input_x_offset = xi * (INPUT_X_PITCH / FSV);
|
||||
uint input_y_offset = yi * (DILATION_SIZE_Y * INPUT_Y_PITCH / FSV);
|
||||
@@ -135,10 +132,8 @@ KERNEL(convolution)(
|
||||
|
||||
#if PRELOAD_WEIGHTS
|
||||
FILTER_TYPE4 wei[CEIL_DIV(FILTER_SPATIAL_SIZE, 4) * 4];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fsi = 0; fsi < FILTER_SPATIAL_SIZE; fsi += 4) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofi = 0; ofi < 4; ++ofi) {
|
||||
unroll_for (uint fsi = 0; fsi < FILTER_SPATIAL_SIZE; fsi += 4) {
|
||||
unroll_for(uint ofi = 0; ofi < 4; ++ofi) {
|
||||
uint preload_offset = (fsi / 4) * 4 + ofi;
|
||||
uint weights_idx = weights_offset + ofi * WEIGHTS_I_PITCH + (fsi / 4) * WEIGHTS_YXS_PITCH;
|
||||
wei[preload_offset] = weights[weights_idx];
|
||||
@@ -159,8 +154,7 @@ for (; y < tile_y_end; ++y) {
|
||||
|
||||
int acc[OUTPUT_BLOCK_X][4] = { };
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
|
||||
unroll_for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
|
||||
uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
|
||||
|
||||
uint4 fx = fis % FILTER_SIZE_X;
|
||||
@@ -178,17 +172,16 @@ for (; y < tile_y_end; ++y) {
|
||||
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
|
||||
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
|
||||
INPUT_TYPE4 in_trans0;
|
||||
INPUT_TYPE4 in_trans1;
|
||||
INPUT_TYPE4 in_trans2;
|
||||
INPUT_TYPE4 in_trans3;
|
||||
#if TILED
|
||||
in_trans0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans1 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans2 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans3 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s3]), (fx.s3 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans1 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans2 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans3 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s3]), (fx.s3 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
#elif PRELOAD_INPUT
|
||||
uint4 input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
|
||||
uint4 input_y_offset = fy * INPUT_LINE_SIZE;
|
||||
@@ -243,19 +236,18 @@ for (; y < tile_y_end; ++y) {
|
||||
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
|
||||
# endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
|
||||
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
|
||||
INPUT_TYPE4 in_trans0;
|
||||
INPUT_TYPE4 in_trans1;
|
||||
INPUT_TYPE4 in_trans2;
|
||||
INPUT_TYPE4 in_trans3;
|
||||
#if TILED
|
||||
in_trans0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
# if FILTER_BLOCKED % 4 > 1
|
||||
in_trans1 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans1 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
# endif
|
||||
# if FILTER_BLOCKED % 4 > 2
|
||||
in_trans2 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in_trans2 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
# endif
|
||||
#elif PRELOAD_INPUT
|
||||
uint4 input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
|
||||
@@ -317,16 +309,14 @@ for (; y < tile_y_end; ++y) {
|
||||
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
|
||||
# endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
|
||||
unroll_for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
|
||||
uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
|
||||
uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
|
||||
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
|
||||
|
||||
# if TILED
|
||||
in0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy]), (fx * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
in0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy]), (fx * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
|
||||
# elif PRELOAD_INPUT
|
||||
uint input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
|
||||
uint input_y_offset = fy * INPUT_LINE_SIZE;
|
||||
@@ -349,17 +339,14 @@ for (; y < tile_y_end; ++y) {
|
||||
#endif
|
||||
|
||||
#if TILE_Y != 1
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint yi = 0; yi < FILTER_SIZE_Y - 1; ++yi) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
|
||||
unroll_for (uint yi = 0; yi < FILTER_SIZE_Y - 1; ++yi) {
|
||||
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
|
||||
in[yi * INPUT_LINE_SIZE + xi] = in[(yi + 1) * INPUT_LINE_SIZE + xi];
|
||||
}
|
||||
}
|
||||
{
|
||||
uint yi = FILTER_SIZE_Y - 1;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
|
||||
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
|
||||
in[yi * INPUT_LINE_SIZE + xi] = input[input_offset + xi * (INPUT_X_PITCH / FSV)];
|
||||
}
|
||||
input_offset += DILATION_SIZE_Y * INPUT_Y_PITCH / FSV;
|
||||
@@ -456,4 +443,3 @@ for (; y < tile_y_end; ++y) {
|
||||
#undef WEIGHTS_YXS_PITCH
|
||||
|
||||
#undef FILTER_SPATIAL_SIZE
|
||||
#undef CEIL_DIV
|
||||
|
||||
@@ -3,12 +3,11 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
|
||||
#define INPUT0_PACKED_TYPE uint
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL(convolution_gpu_b_fs_yx_fsv4_int8)(
|
||||
const __global INPUT0_PACKED_TYPE* input,
|
||||
|
||||
@@ -2,8 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
|
||||
@@ -146,10 +147,8 @@
|
||||
# error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - internal error, CHECK_BOUNDARY_IN_SLM enabled without PRELOAD_INPUT_TO_SLM.
|
||||
#endif
|
||||
|
||||
#define CEIL_DIV(a, b) ( ((a) + (b) - 1) / (b) )
|
||||
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
__attribute__((reqd_work_group_size(LWS0, LWS1, SIMD)))
|
||||
KERNEL(convolution)(
|
||||
const __global INPUT0_TYPE *input,
|
||||
@@ -209,8 +208,7 @@ KERNEL(convolution)(
|
||||
|
||||
#if ASYMMETRIC_DATA_QUANTIZATION && CHECK_BOUNDARY_IN_SLM
|
||||
uint4 azp_uniform[FSV / iteration_preload_bytes];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < FSV / iteration_preload_bytes; ++i) {
|
||||
unroll_for(uint i = 0; i < FSV / iteration_preload_bytes; ++i) {
|
||||
azp_uniform[i] = ((const __global uint4*)(activations_zp + (f + i * iteration_preload_bytes)))[0];
|
||||
}
|
||||
#endif
|
||||
@@ -285,8 +283,7 @@ KERNEL(convolution)(
|
||||
if (early_return)
|
||||
return;
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
|
||||
unroll_for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
|
||||
// Loop over 4 filter spatials that match imad case
|
||||
uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
|
||||
|
||||
@@ -307,8 +304,7 @@ KERNEL(convolution)(
|
||||
uint4 input_idx = input_spatial_offset + input_offset;
|
||||
|
||||
uint tx = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
|
||||
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
|
||||
INPUT_TYPE16 tmp_in0 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s0);
|
||||
INPUT_TYPE16 tmp_in1 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s1);
|
||||
INPUT_TYPE16 tmp_in2 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s2);
|
||||
@@ -374,13 +370,11 @@ KERNEL(convolution)(
|
||||
uint4 input_y_offset = fy * dilation_size_y * input_y_pitch;
|
||||
uint4 input_spatial_offset = input_x_offset + input_y_offset;
|
||||
uint4 input_start_offset = input_spatial_offset + input_offset;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
uint4 input_idx = input_start_offset + tx * STRIDE_SIZE_X * input_x_pitch;
|
||||
// Block reads along feature slice
|
||||
uint fw = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
INPUT_TYPE4 tmp_in0 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s0);
|
||||
INPUT_TYPE4 tmp_in1 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s1);
|
||||
INPUT_TYPE4 tmp_in2 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s2);
|
||||
@@ -417,14 +411,12 @@ KERNEL(convolution)(
|
||||
#endif
|
||||
// Weights loading:
|
||||
FILTER_TYPE4 wei[F_PER_WI];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
wei[fw] = AS_FILTER_TYPE4(_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
|
||||
}
|
||||
|
||||
#if CHECK_BOUNDARY && !CHECK_BOUNDARY_IN_SLM
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
int4 input_x = convert_int4(x * STRIDE_SIZE_X + tx * STRIDE_SIZE_X + fx * DILATION_SIZE_X) - PADDING_SIZE_X;
|
||||
int4 input_y = convert_int4(y * STRIDE_SIZE_Y + fy * dilation_size_y) - PADDING_SIZE_Y;
|
||||
int4 input_pad = input_x < 0 || input_x >= INPUT0_SIZE_X || input_y < 0 || input_y >= INPUT0_SIZE_Y;
|
||||
@@ -433,20 +425,16 @@ KERNEL(convolution)(
|
||||
#else
|
||||
#define padding_value(fw) ((INPUT0_TYPE)0)
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
in_trans0[tx * F_PER_WI + fwp] = input_pad.s0 ? padding_value(fwp) : in_trans0[tx * F_PER_WI + fwp];
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
in_trans1[tx * F_PER_WI + fwp] = input_pad.s1 ? padding_value(fwp) : in_trans1[tx * F_PER_WI + fwp];
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
in_trans2[tx * F_PER_WI + fwp] = input_pad.s2 ? padding_value(fwp) : in_trans2[tx * F_PER_WI + fwp];
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
in_trans3[tx * F_PER_WI + fwp] = input_pad.s3 ? padding_value(fwp) : in_trans3[tx * F_PER_WI + fwp];
|
||||
}
|
||||
#undef padding_value
|
||||
@@ -455,30 +443,24 @@ KERNEL(convolution)(
|
||||
|
||||
// Transpose input:
|
||||
INPUT_TYPE4 in[TILE_X * F_PER_WI];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
uint in_offset = tx * F_PER_WI + fw;
|
||||
in[in_offset] = (INPUT_TYPE4)(in_trans0[in_offset], in_trans1[in_offset], in_trans2[in_offset], in_trans3[in_offset]);
|
||||
}
|
||||
}
|
||||
|
||||
// IMAD:
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
acc[tx * F_PER_WI + fw] = IMAD(acc[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], wei[fw]);
|
||||
}
|
||||
}
|
||||
|
||||
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
|
||||
// Accumulate for input values for asymmetric weights:
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
src_sum[tx * F_PER_WI + fw] = IMAD(src_sum[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], (char4)(1, 1, 1, 1));
|
||||
}
|
||||
}
|
||||
@@ -492,13 +474,11 @@ KERNEL(convolution)(
|
||||
// Leftovers in filters spatial - use raw multiplication instead of imad
|
||||
// Load inputs before loop to avoid byte scattered reads + there are at most 3 leftovers
|
||||
FILTER_TYPE4 wei[F_PER_WI];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
|
||||
unroll_for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
wei[fw] = AS_FILTER_TYPE4(_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
|
||||
unroll_for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
|
||||
// Input loading:
|
||||
uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
|
||||
uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
|
||||
@@ -511,8 +491,7 @@ KERNEL(convolution)(
|
||||
uint input_idx = input_spatial_offset + input_offset;
|
||||
|
||||
uint tx = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
|
||||
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
|
||||
INPUT_TYPE16 tmp_in0 = INPUT_BLOCK_READN(16, input_ptr, input_idx);
|
||||
VEC_TO_ARRAY_16(in_trans0, tmp_in0, tx);
|
||||
input_idx += 16 * SIMD;
|
||||
@@ -543,12 +522,10 @@ KERNEL(convolution)(
|
||||
uint input_y_offset = fy * dilation_size_y * input_y_pitch;
|
||||
uint input_spatial_offset = input_x_offset + input_y_offset;
|
||||
uint input_start_offset = input_spatial_offset + input_offset;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
uint input_idx = input_start_offset + tx * STRIDE_SIZE_X * input_x_pitch;
|
||||
uint fw = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
INPUT_TYPE4 tmp_in0 = INPUT_BLOCK_READN(4, input_ptr, input_idx);
|
||||
VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx * F_PER_WI + fw);
|
||||
input_idx += 4 * SIMD;
|
||||
@@ -566,8 +543,7 @@ KERNEL(convolution)(
|
||||
# endif
|
||||
|
||||
#if CHECK_BOUNDARY && !CHECK_BOUNDARY_IN_SLM
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
int input_x = (x + tx) * STRIDE_SIZE_X + fx * DILATION_SIZE_X - PADDING_SIZE_X;
|
||||
int input_y = y * STRIDE_SIZE_Y + fy * dilation_size_y - PADDING_SIZE_Y;
|
||||
int input_pad = input_x < 0 || input_x >= INPUT0_SIZE_X || input_y < 0 || input_y >= INPUT0_SIZE_Y;
|
||||
@@ -576,8 +552,7 @@ KERNEL(convolution)(
|
||||
#else
|
||||
#define padding_value(fw) ((INPUT0_TYPE)0)
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
|
||||
in_trans0[tx * F_PER_WI + fwp] = input_pad ? padding_value(fwp) : in_trans0[tx * F_PER_WI + fwp];
|
||||
}
|
||||
#undef padding_value
|
||||
@@ -585,20 +560,16 @@ KERNEL(convolution)(
|
||||
#endif
|
||||
|
||||
// Raw multiply accumulate:
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
acc[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw] * (int)wei[fw][fi];
|
||||
}
|
||||
}
|
||||
|
||||
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
|
||||
// Accumulate input values for asymmetric weights:
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
src_sum[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw];
|
||||
}
|
||||
}
|
||||
@@ -614,18 +585,14 @@ KERNEL(convolution)(
|
||||
#if BIAS_TERM
|
||||
# if BIAS_PER_OFM
|
||||
MAKE_VECTOR_TYPE(BIAS_TYPE, F_PER_WI) bias_val = BLOCK_READN(BIAS_TYPE, F_PER_WI, biases, f);
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(((BIAS_TYPE*)&bias_val)[fw]);
|
||||
}
|
||||
}
|
||||
# elif BIAS_PER_OUTPUT
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
uint bias_offset = GET_BIAS_INDEX(b, f + fw * SIMD + get_sub_group_local_id(), y, x + tx);
|
||||
BIAS_TYPE bias = biases[bias_offset];
|
||||
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(bias);
|
||||
@@ -639,10 +606,8 @@ KERNEL(convolution)(
|
||||
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
|
||||
{
|
||||
MAKE_VECTOR_TYPE(WEIGHTS_ZERO_POINTS_TYPE, F_PER_WI) wzp = BLOCK_READN(WEIGHTS_ZERO_POINTS_TYPE, F_PER_WI, weights_zp, f);
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
dequantized[tx * F_PER_WI + fw] -= TO_DEQUANTIZED_TYPE(src_sum[tx * F_PER_WI + fw]) * TO_DEQUANTIZED_TYPE(((WEIGHTS_ZERO_POINTS_TYPE*)&wzp)[fw]);
|
||||
}
|
||||
}
|
||||
@@ -652,10 +617,8 @@ KERNEL(convolution)(
|
||||
#if COMPENSATION_TERM
|
||||
{
|
||||
MAKE_VECTOR_TYPE(COMPENSATION_TYPE, F_PER_WI) comp = BLOCK_READN(COMPENSATION_TYPE, F_PER_WI, compensation, f);
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(((COMPENSATION_TYPE*)&comp)[fw]);
|
||||
}
|
||||
}
|
||||
@@ -664,14 +627,12 @@ KERNEL(convolution)(
|
||||
|
||||
OUTPUT_TYPE out[TILE_X * F_PER_WI];
|
||||
// Fused ops and conversion to output type
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
#if HAS_FUSED_OPS
|
||||
uint fused_ops_x = x + tx;
|
||||
uint fused_ops_f = f;
|
||||
uint fw = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
DEQUANTIZED_TYPE4 fused_ops_in;
|
||||
ARRAY_TO_VEC_4(fused_ops_in, dequantized, tx * F_PER_WI + fw);
|
||||
FUSED_OPS_4;
|
||||
@@ -693,8 +654,7 @@ KERNEL(convolution)(
|
||||
out[tx * F_PER_WI + fw] = FUSED_OPS_RESULT_1;
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
out[tx * F_PER_WI + fw] = TO_OUTPUT_TYPE(dequantized[tx * F_PER_WI + fw]);
|
||||
}
|
||||
#endif
|
||||
@@ -702,10 +662,8 @@ KERNEL(convolution)(
|
||||
|
||||
// Fill results outside output in features with OUTPUT_PAD_VALUE.
|
||||
if (OUTPUT_FEATURE_NUM % FSV != 0 && f + FSV > OUTPUT_FEATURE_NUM) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
|
||||
const uint sglid = get_sub_group_local_id();
|
||||
// Hint here can save some movs if features are divisible by SIMD and not by FSV
|
||||
ASSUME_HINT(sglid < SIMD);
|
||||
@@ -721,8 +679,7 @@ KERNEL(convolution)(
|
||||
// Full output tile x write using block write ladder
|
||||
uint tx = 0;
|
||||
#if OUTPUT_TYPE_SIZE * 16 <= MAX_OPT_BLOCK_WRITE_BYTES
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
|
||||
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
|
||||
OUTPUT_TYPE16 tmp_write;
|
||||
ARRAY_TO_VEC_16(tmp_write, out, tx);
|
||||
DT_OUTPUT_BLOCK_WRITE16(output, output_offset, tmp_write);
|
||||
@@ -730,8 +687,7 @@ KERNEL(convolution)(
|
||||
}
|
||||
#endif
|
||||
#if OUTPUT_TYPE_SIZE * 8 <= MAX_OPT_BLOCK_WRITE_BYTES
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; tx + 8 <= TILE_X * F_PER_WI; tx += 8) {
|
||||
unroll_for(; tx + 8 <= TILE_X * F_PER_WI; tx += 8) {
|
||||
OUTPUT_TYPE8 tmp_write;
|
||||
ARRAY_TO_VEC_8(tmp_write, out, tx);
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, output_offset, tmp_write);
|
||||
@@ -739,16 +695,14 @@ KERNEL(convolution)(
|
||||
}
|
||||
#endif
|
||||
#if OUTPUT_TYPE_SIZE * 4 <= MAX_OPT_BLOCK_WRITE_BYTES
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; tx + 4 <= TILE_X * F_PER_WI; tx += 4) {
|
||||
unroll_for(; tx + 4 <= TILE_X * F_PER_WI; tx += 4) {
|
||||
OUTPUT_TYPE4 tmp_write;
|
||||
ARRAY_TO_VEC_4(tmp_write, out, tx);
|
||||
DT_OUTPUT_BLOCK_WRITE4(output, output_offset, tmp_write);
|
||||
output_offset += 4 * SIMD;
|
||||
}
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; tx + 2 <= TILE_X * F_PER_WI; tx += 2) {
|
||||
unroll_for(; tx + 2 <= TILE_X * F_PER_WI; tx += 2) {
|
||||
OUTPUT_TYPE2 tmp_write;
|
||||
ARRAY_TO_VEC_2(tmp_write, out, tx);
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, output_offset, tmp_write);
|
||||
@@ -759,20 +713,17 @@ KERNEL(convolution)(
|
||||
}
|
||||
} else {
|
||||
// Leftovers write, block writes in f dimension only
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
if (tx < OUTPUT_SIZE_X % TILE_X) {
|
||||
uint fw = 0;
|
||||
#if OUTPUT_TYPE_SIZE * 4 <= MAX_OPT_BLOCK_WRITE_BYTES
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
|
||||
OUTPUT_TYPE4 tmp_write;
|
||||
ARRAY_TO_VEC_4(tmp_write, out, tx * F_PER_WI + fw);
|
||||
DT_OUTPUT_BLOCK_WRITE4(output, output_offset + fw * SIMD, tmp_write);
|
||||
}
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; fw + 2 <= F_PER_WI; fw += 2) {
|
||||
unroll_for(; fw + 2 <= F_PER_WI; fw += 2) {
|
||||
OUTPUT_TYPE2 tmp_write;
|
||||
ARRAY_TO_VEC_2(tmp_write, out, tx * F_PER_WI + fw);
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, output_offset + fw * SIMD, tmp_write);
|
||||
|
||||
@@ -4,8 +4,9 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
// Pastes a base type name and a width into a single token (e.g. int + 4 -> int4).
// Arguments are pasted as written; they are not macro-expanded first.
#define TYPE_N_(base, width) base##width
|
||||
// Expanding wrapper around TYPE_N_: the extra call level lets macro arguments
// be expanded before they are token-pasted.
#define TYPE_N(base, width) TYPE_N_(base, width)
|
||||
@@ -45,15 +46,12 @@
|
||||
|
||||
// Forwards x to AS_TYPE_N with FILTER_TYPE and width 4.
// NOTE(review): presumably a reinterpret to the 4-wide FILTER_TYPE vector; AS_TYPE_N is defined outside this view - confirm.
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
|
||||
|
||||
// Integer division rounded up; intended for a non-negative dividend and a positive divisor.
#define CEIL_DIV(dividend, divisor) (((dividend) + (divisor) - 1) / (divisor))
|
||||
// Rounds value up to the nearest multiple of step (built on CEIL_DIV).
#define ALIGN(value, step) (CEIL_DIV(value, step) * (step))
|
||||
|
||||
// Sub-group width: passed to the kernel's REQD_SUB_GROUP_SIZE(SIMD) attribute and used in index arithmetic (e.g. ofb * SIMD).
#define SIMD 16
|
||||
// Feature block size used in index arithmetic (e.g. ofb * FSV, FILTER_IFM_NUM % FSV).
// NOTE(review): presumably the feature-slice width of the fsv16 layout - confirm.
#define FSV 16
|
||||
|
||||
// int8 conv_input and weights data is packed to int32 "batches",
|
||||
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
|
||||
KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
const __global INPUT0_TYPE *conv_input,
|
||||
@@ -129,18 +127,15 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
|
||||
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
|
||||
uint4 weights_zp_val[OFM_BLOCKS_PER_SIMD];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
|
||||
}
|
||||
#if FILTER_IFM_NUM % FSV != 0
|
||||
uint4 weights_zp_vec_partial[OFM_BLOCKS_PER_SIMD];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
|
||||
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
|
||||
unroll_for(uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
|
||||
wzp_p[f] = 0;
|
||||
}
|
||||
}
|
||||
@@ -152,8 +147,7 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
|
||||
#if FILTER_IFM_NUM % FSV != 0
|
||||
if (in_f_start + (k + 1) * FSV >= ALIGN(FILTER_IFM_NUM, FSV)) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
|
||||
}
|
||||
}
|
||||
@@ -170,11 +164,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
|
||||
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
|
||||
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OFM_BLOCKS_PER_SIMD];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
dotProdAZPxWZP[ofb] = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ive = 0; ive < 4; ive++) {
|
||||
unroll_for(uint ive = 0; ive < 4; ive++) {
|
||||
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
|
||||
IMAD(dotProdAZPxWZP[ofb][ive],
|
||||
AS_INPUT0_TYPE_4(data_zp_val[ive]),
|
||||
@@ -188,12 +180,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
|
||||
// Load input block IN_BLOCK_DEPTH x IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
|
||||
unroll_for(uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
|
||||
unroll_for(uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
|
||||
unroll_for(uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
|
||||
uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
|
||||
#ifdef SHOULD_USE_DATA_ZP
|
||||
const int y_idx = input_y + fyn * DILATION_SIZE_Y + iyb;
|
||||
@@ -300,23 +289,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
|
||||
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
|
||||
for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
|
||||
unroll_for(uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
|
||||
unroll_for(uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
|
||||
unroll_for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
|
||||
|
||||
uint4 weights_val[OFM_BLOCKS_PER_SIMD];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ive = 0; ive < 4; ive++) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for (uint ive = 0; ive < 4; ive++) {
|
||||
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
#ifdef SHOULD_USE_DATA_ZP
|
||||
ACCUMULATOR_TYPE dotProdAZPxW = 0;
|
||||
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
|
||||
@@ -325,19 +308,16 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
|
||||
unroll_for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
|
||||
const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
|
||||
const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
|
||||
const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
|
||||
const uint shuffle_wi = x_block_idx % SIMD;
|
||||
const uint shuffle_idx = x_block_idx / SIMD;
|
||||
|
||||
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
|
||||
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
|
||||
shuffle_wi));
|
||||
|
||||
dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
|
||||
@@ -401,17 +381,12 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
get_sub_group_local_id();
|
||||
|
||||
if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
|
||||
unroll_for(uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
|
||||
if (get_sub_group_id() == wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < wg; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
unroll_for(uint ofb = 0; ofb < wg; ++ofb) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
@@ -422,24 +397,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
}
|
||||
}
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
dotProd[0][od][oh][ow] = dotProd[wg][od][oh][ow];
|
||||
}
|
||||
}
|
||||
}
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
unroll_for(uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * OFM_SIZE_PER_SIMD : 0) +
|
||||
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
@@ -454,14 +422,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
}
|
||||
}
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
@@ -480,14 +444,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
return;
|
||||
|
||||
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD + get_sub_group_local_id();
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
unroll_for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
const uint partial_acc_ptr_idx =
|
||||
wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
|
||||
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
|
||||
@@ -510,29 +470,23 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
|
||||
#if BIAS_TERM
|
||||
BIAS_TYPE bias[OFM_VALUES_PER_WI];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
bias[ofb] = biases[out_f + ofb * SIMD];
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef COMPENSATION_TERM
|
||||
COMPENSATION_TYPE comp[OFM_VALUES_PER_WI];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
comp[ofb] = compensation[out_f + ofb * SIMD];
|
||||
}
|
||||
#endif
|
||||
|
||||
ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
|
||||
#if BIAS_TERM
|
||||
dequantized[ofb][od][oh][ow] += bias[ofb];
|
||||
@@ -546,17 +500,13 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
}
|
||||
|
||||
OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
|
||||
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
FUSED_OPS_PRELOAD_SCALAR;
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
|
||||
ACTIVATION_TYPE dequantized_val = dequantized[ofb][od][oh][ow];
|
||||
#if HAS_FUSED_OPS
|
||||
# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
|
||||
@@ -585,21 +535,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
|
||||
bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
|
||||
if (good_of_block) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
|
||||
if (good_z) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
|
||||
if (good_y) {
|
||||
uint ow = 0;
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
|
||||
unroll_for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 8; ++i) {
|
||||
unroll_for (uint i = 0; i < 8; ++i) {
|
||||
result_val[i] = result[ofb][od][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
|
||||
@@ -607,11 +553,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
}
|
||||
#endif
|
||||
#if OUTPUT_TYPE_SIZE <= 2
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
|
||||
unroll_for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 4; ++i) {
|
||||
unroll_for (uint i = 0; i < 4; ++i) {
|
||||
result_val[i] = result[ofb][od][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
|
||||
@@ -619,11 +563,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
}
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
|
||||
unroll_for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
|
||||
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 0; i < 2; ++i) {
|
||||
unroll_for (uint i = 0; i < 2; ++i) {
|
||||
result_val[i] = result[ofb][od][oh][ow + i];
|
||||
}
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
|
||||
@@ -655,12 +597,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
#else
|
||||
const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_z, out_y, out_x);
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
|
||||
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
|
||||
if (good_z) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
|
||||
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
|
||||
if (good_y) {
|
||||
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
|
||||
@@ -720,9 +660,6 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
|
||||
|
||||
#undef AS_FILTER_TYPE_4
|
||||
|
||||
#undef CEIL_DIV
|
||||
#undef ALIGN
|
||||
|
||||
#undef SIMD
|
||||
#undef FSV
|
||||
#undef OFM_VALUES_PER_WI
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
#if FP16_UNIT_USED
|
||||
// FP16 path: 8-wide sub-group block read of ushort data, reinterpreted as half8.
// The offset is added after the ushort* cast, so it counts ushort elements.
#define ALIGNED_BLOCK_READ8(base, elem_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(base) + (elem_offset)))
|
||||
|
||||
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
|
||||
{ \
|
||||
const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
|
||||
@@ -29,9 +29,6 @@
|
||||
_result = fma( _blockB.s7, acol7, _result ); \
|
||||
}
|
||||
#else
|
||||
// Block read - the block is currently 4-byte aligned.
|
||||
// FP32 path: 8-wide sub-group block read of uint data, reinterpreted as float8.
// The offset is added after the uint* cast, so it counts uint elements.
#define ALIGNED_BLOCK_READ8(base, elem_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(base) + (elem_offset)))
|
||||
|
||||
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
|
||||
{ \
|
||||
const float16 acol0 = TRANSPOSE_BLOCK_16( _blockA.s0 ); \
|
||||
@@ -53,7 +50,11 @@
|
||||
}
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
#ifndef ACCUMULATOR_TYPE
|
||||
#define ACCUMULATOR_TYPE INPUT0_TYPE
|
||||
#endif
|
||||
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_bfyx_1x1)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@@ -63,14 +64,15 @@ KERNEL(convolution_bfyx_1x1)(
|
||||
#endif
|
||||
uint split_idx)
|
||||
{
|
||||
const uint xy = (uint)get_group_id(0) * 16 + get_sub_group_local_id();
|
||||
const uint group_xy = (uint)get_group_id(0) * 16;
|
||||
const uint xy = group_xy + get_sub_group_local_id();
|
||||
const uint x = xy % OUTPUT_SIZE_X;
|
||||
const uint y = xy / OUTPUT_SIZE_X;
|
||||
const uint f = (uint)get_group_id(1) * 16 + get_sub_group_local_id();//get_global_id(1);
|
||||
const uint b = (uint)get_global_id(2);
|
||||
const uint group_f = (uint)get_group_id(1) * 16;
|
||||
|
||||
MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;
|
||||
MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 16) blockC00 = INPUT0_VAL_ZERO;
|
||||
|
||||
#if BIAS_TERM
|
||||
#if BIAS_PER_OUTPUT
|
||||
@@ -80,7 +82,7 @@ KERNEL(convolution_bfyx_1x1)(
|
||||
#endif
|
||||
for(uint i = 0; i < 16; i++)
|
||||
{
|
||||
blockC00[i] = intel_sub_group_shuffle(biases[bias_index], i);
|
||||
blockC00[i] = _sub_group_shuffle(biases[bias_index], i);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -92,18 +94,18 @@ KERNEL(convolution_bfyx_1x1)(
|
||||
const uint filter_offset = group_f * ((FILTER_OFM_PITCH + 8 - 1) / 8) * 8;//f*FILTER_OFM_PITCH;
|
||||
const uint xy_block_num = (INPUT0_FEATURE_PITCH + 16 - 1) / 16;
|
||||
const uint f_block_num = (INPUT0_FEATURE_NUM + 8 - 1) / 8;
|
||||
const uint input_offset = in_split_offset + xy * 8 + b * xy_block_num * f_block_num * 128;//b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
|
||||
const uint input_offset = in_split_offset + group_xy * 8 + b * xy_block_num * f_block_num * 128;//b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
|
||||
|
||||
for (uint k = 0; k < (FILTER_IFM_NUM + 8 - 1) / 8; ++k)
|
||||
{
|
||||
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00;
|
||||
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
|
||||
MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) blockA00;
|
||||
MAKE_VECTOR_TYPE(FILTER_TYPE, 8) blockB00;
|
||||
|
||||
uint input_idx = input_offset + k * 8 * xy_block_num * 16;
|
||||
uint filter_idx = filter_offset + k * 8 * 16;
|
||||
|
||||
blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
|
||||
blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx);
|
||||
blockA00 = DT_INPUT_BLOCK_READ8(input, input_idx);
|
||||
blockB00 = DT_FILTER_BLOCK_READ8(weights, filter_idx);
|
||||
|
||||
MULTIPLY_BLOCKS_16x8_8x16(blockC00, blockB00, blockA00);
|
||||
}
|
||||
@@ -128,3 +130,4 @@ KERNEL(convolution_bfyx_1x1)(
|
||||
#undef CONCAT_TOKEN
|
||||
#undef CONCAT_TOKEN_HANDLER1
|
||||
#undef MULTIPLY_BLOCKS_16x16
|
||||
#undef ACCUMULATOR_TYPE
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/gemm_common.cl"
|
||||
|
||||
#define MULT(C_, A_, i_) \
|
||||
@@ -13,7 +12,7 @@
|
||||
DOT8i(C_, B24, A_, i_ + 3);
|
||||
|
||||
__attribute__((reqd_work_group_size(16, TY, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_gpu_bfyx_1x1_hgemm_buf_16x1)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@@ -56,14 +55,14 @@ KERNEL(convolution_gpu_bfyx_1x1_hgemm_buf_16x1)(
|
||||
|
||||
// 512 MADs
|
||||
|
||||
half8 B0 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
|
||||
half8 B0 = as_half8(_sub_group_block_read_us8(weights, coordB));
|
||||
coordB.y += 8;
|
||||
half8 B8 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
|
||||
half8 B8 = as_half8(_sub_group_block_read_us8(weights, coordB));
|
||||
coordB.y += 8;
|
||||
|
||||
half8 B16 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
|
||||
half8 B16 = as_half8(_sub_group_block_read_us8(weights, coordB));
|
||||
coordB.y += 8;
|
||||
half8 B24 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
|
||||
half8 B24 = as_half8(_sub_group_block_read_us8(weights, coordB));
|
||||
coordB.y += 8;
|
||||
|
||||
half8 A0 = A_load[K8*0 + k8];
|
||||
|
||||
@@ -2,11 +2,10 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
// Sub-group size required by this kernel; passed to its REQD_SUB_GROUP_SIZE(SIMD_SIZE) attribute.
#define SIMD_SIZE 8
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD_SIZE)
|
||||
KERNEL(convolution)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@@ -58,18 +57,18 @@ KERNEL(convolution)(
|
||||
}
|
||||
|
||||
#if OUT_BLOCK_DEPTH == 8
|
||||
float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
|
||||
float8 w = as_float8(_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
|
||||
#elif OUT_BLOCK_DEPTH == 4
|
||||
float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
|
||||
float4 w = as_float4(_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
|
||||
#elif OUT_BLOCK_DEPTH == 2
|
||||
float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
|
||||
float2 w = as_float2(_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
|
||||
#endif
|
||||
|
||||
for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
|
||||
{
|
||||
for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
|
||||
{
|
||||
float _in = intel_sub_group_shuffle(in[br], bc);
|
||||
float _in = _sub_group_shuffle(in[br], bc);
|
||||
for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
|
||||
{
|
||||
dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];
|
||||
|
||||
@@ -2,18 +2,11 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#if FP16_UNIT_USED
|
||||
// FP16 path: sub-group block read of one ushort per work-item, reinterpreted as half.
// Uses the scalar read rather than the 8-wide _us8 variant: as_half() needs a 16-bit operand,
// and the result is assigned to a scalar UNIT_TYPE.
// NOTE: despite its name, byte_offset is added after the ushort* cast, so it counts ushort elements.
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_half(intel_sub_group_block_read_us((const __global ushort*)(ptr) + (byte_offset)))
|
||||
// FP16 path: sub-group block write of one half per work-item, stored as its ushort bit pattern.
// val is reinterpreted with as_ushort() so its width matches the scalar intel_sub_group_block_write_us
// (as_ushort8 would require the 8-wide write and a half8 value).
// NOTE: despite its name, byte_offset is added after the ushort* cast, so it counts ushort elements.
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (byte_offset), as_ushort(val))
|
||||
#else
|
||||
// FP32 path: sub-group block read of one uint per work-item, reinterpreted as float.
// The offset is added after the uint* cast, so it counts uint elements.
#define ALIGNED_BLOCK_READ(base, elem_offset) as_float(intel_sub_group_block_read((const __global uint*)(base) + (elem_offset)))
|
||||
// FP32 path: sub-group block write of one float per work-item, stored as its uint bit pattern.
// val is reinterpreted with as_uint() so its width matches the scalar intel_sub_group_block_write
// (as_uint8 would require the 8-wide write and a float8 value).
// NOTE: despite its name, byte_offset is added after the uint* cast, so it counts uint elements.
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
KERNEL(convolution_depthwise_weights_lwg)(
|
||||
__global INPUT0_TYPE* input,
|
||||
@@ -41,7 +34,7 @@ KERNEL(convolution_depthwise_weights_lwg)(
|
||||
const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_group_offset;
|
||||
|
||||
#if FILTER_SIZE_Y * FILTER_SIZE_X % 16 == 0 && !FP16_UNIT_USED
|
||||
UNIT_TYPE w = ALIGNED_BLOCK_READ(weights, filter_offset);
|
||||
UNIT_TYPE w = DT_FILTER_BLOCK_READ(weights, filter_offset);
|
||||
#elif FILTER_SIZE_X * FILTER_SIZE_Y > 16 && FILTER_SIZE_X * FILTER_SIZE_Y <= 25
|
||||
const uint lid = get_local_id(0);
|
||||
UNIT_TYPE w[2] = { UNIT_VAL_ZERO };
|
||||
@@ -78,9 +71,9 @@ KERNEL(convolution_depthwise_weights_lwg)(
|
||||
#if FILTER_SIZE_X * FILTER_SIZE_Y > 16 && FILTER_SIZE_X * FILTER_SIZE_Y <= 25
|
||||
const uint id = (j*FILTER_Y_PITCH + i*FILTER_X_PITCH) / 16;
|
||||
const uint idx = (j*FILTER_Y_PITCH + i*FILTER_X_PITCH) % 16;
|
||||
UNIT_TYPE w1 = intel_sub_group_shuffle(w[id], idx);
|
||||
UNIT_TYPE w1 = _sub_group_shuffle(w[id], idx);
|
||||
#else
|
||||
UNIT_TYPE w1 = intel_sub_group_shuffle(w, j*FILTER_Y_PITCH + i*FILTER_X_PITCH);
|
||||
UNIT_TYPE w1 = _sub_group_shuffle(w, j*FILTER_Y_PITCH + i*FILTER_X_PITCH);
|
||||
#endif
|
||||
dotProd = mad(input[input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH],
|
||||
w1, dotProd);
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
@@ -16,7 +15,7 @@
|
||||
#define TILE_X 12 // Width of tile loaded in input (src0)
|
||||
#define TILE_Y 10 // Height of tile loaded in input (src0)
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_f16_10x12x16)(
|
||||
const __global half *src0,
|
||||
__global half *dst,
|
||||
@@ -100,12 +99,12 @@ KERNEL(convolution_f16_10x12x16)(
|
||||
unsigned interleaved_y = 0;
|
||||
LOOP(KERNEL_SLICE_DIV2, interleaved_y,
|
||||
{
|
||||
p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
|
||||
p2BlockB[interleaved_y] = _sub_group_block_read_us2( (const __global ushort*)src1_read );
|
||||
src1_read += ALIGNED_OFM_PER_GROUP * 2;
|
||||
} )
|
||||
if ( kernel_slice_is_odd )
|
||||
{
|
||||
pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
|
||||
pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = _sub_group_block_read_us( (const __global ushort*)src1_read );
|
||||
src1_read += ALIGNED_OFM_PER_GROUP * 2;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,159 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
//#include "include/cnn_common.cl"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Direct Convolution
|
||||
#if defined(cl_intel_subgroups_short)

#define TILE_M          DY      // Height of tile in input patches (src0)
#define TILE_K          DX      // Width of tile in input patches (src0)
#define TILE_N          16      // Num filter channels per tile (src1)

#define TILE_X          8       // Width of tile loaded in input (src0)
#define TILE_Y          8       // Height of tile loaded in input (src0)

// Direct convolution on half-precision data, compiled for a sub-group size of 16.
//
// Each work-item accumulates a TILE_M x TILE_K tile of outputs (blockC) for the
// output feature selected by get_global_id(2), looping over INPUT_FEATURE_NUM
// input feature maps.
//
//   src0   - input data (read with vload4, then shared through sub_group_broadcast)
//   dst    - output data
//   src1   - filter data (read with sub-group block reads)
//   biases - one bias value per output feature, indexed by out_fm
//
// All sizes, pitches and strides (ALIGNED_OFM, INPUT_*_PITCH, OUTPUT_*_PITCH,
// STRIDE_X/Y, KERNEL_WIDTH/HEIGHT, DX/DY, RIGHT_PARTIAL_TILE_K, ...) are expected
// to be supplied as compile-time definitions by the host side.
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void convolution_f16_8x8x16(
    const __global half *src0,
    __global half *dst,
    const __global half *src1,
    const __global half *biases)
{
    const unsigned global_x = (uint)get_global_id(0);
    const unsigned global_y = (uint)get_global_id(1);
    const unsigned global_z = (uint)get_global_id(2);
    // global_z enumerates (batch, aligned output feature) pairs.
    const unsigned out_fm   = global_z % ALIGNED_OFM;
    const unsigned batch_id = global_z / ALIGNED_OFM;
    const unsigned group_x = get_group_id(0);
    const unsigned group_z = get_group_id(2);
    const unsigned max_group_x = get_num_groups(0);
    const unsigned local_z = get_local_id(2);

    // Per-work-item accumulators, one per output position of the tile.
    half blockC[TILE_M * TILE_K] = { 0 };

    uint src0_offset_tile =
        batch_id * INPUT_BATCH_PITCH                        // batch offset
      + ( global_y * TILE_M * STRIDE_Y ) * INPUT_Y_PITCH    // y offset
      + ( global_x * TILE_K * STRIDE_X );                   // x offset
    // Each work-item loads 4 consecutive input values; local_z selects which ones.
    uint src0_offset = src0_offset_tile
      + ( local_z / ( TILE_X / 4 ) ) * INPUT_Y_PITCH        // y tile offset
      + ( local_z % ( TILE_X / 4 ) ) * 4;                   // x tile offset

    const __global half *src1_read = src1 + ( group_z * TILE_N % ALIGNED_OFM ) * 2;

    unsigned patch_depth = 0;
    __attribute__((opencl_unroll_hint(3)))
    do
    {
        // Load atile (input) and btile (filters).
        // Kernel data is partially interleaved.  Every 2 rows are interleaved at float16 granularity.
        // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved.  The non
        // interleaved row is padded with zero to ensure same size as interleaved rows. This
        // interleaving is done to increase consecutive data to fetch which reduces loads required.
        // For example, this is how the kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
        // (0, 0) (8, 0) (16, 0) (24, 0) ...       (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
        // (0, 1) (8, 1) (16, 1) (24, 1) ... =>    (0, 2) (8, 2) (16, 2) (24, 2) ...
        // (0, 2) (8, 2) (16, 2) (24, 2) ...       ...
        // ...
        // NOTE(review): the code below decides on the non-interleaved tail read from
        // KERNEL_WIDTH * KERNEL_HEIGHT being odd, not from KERNEL_WIDTH alone - confirm
        // which of the two the comment above is meant to describe.

        // in case the data is not aligned to sizeof(T)*KERNEL_WIDTH we need to use vload or set the data in a loop
        half4 blockA = vload4(0, src0 + src0_offset );
        src0_offset += INPUT_FEATURE_PITCH;

        // Filter values for the current input feature; filled through ushort views
        // because the block reads return ushort data.
        half blockB[KERNEL_WIDTH * KERNEL_HEIGHT];
        ushort2* p2BlockB = (ushort2*)blockB;
        ushort*  pBlockB  = (ushort* )blockB;

        const bool kernel_slice_is_odd = ( KERNEL_WIDTH * KERNEL_HEIGHT ) % 2 == 1;
        unsigned interleaved_y = 0;
        LOOP(KERNEL_SLICE_DIV2, interleaved_y,
        {
            p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
            src1_read += ALIGNED_OFM * 2;
        } )
        if ( kernel_slice_is_odd )
        {
            // Last, unpaired filter value.
            pBlockB[KERNEL_WIDTH * KERNEL_HEIGHT - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
            src1_read += ALIGNED_OFM * 2;
        }

// Input value n of the loaded tile: component n%4 of blockA held by sub-group lane n/4.
#define BLOCK_A(n) sub_group_broadcast( blockA[(n)%4], (n)/4 )

        // Perform MADs
        // Loop through all patches in tile (patch_x/y)
        // For each patch, sum values (x/y)
        unsigned patch_y=0;
        LOOP(TILE_M, patch_y,
        {
            unsigned patch_x=0;
            LOOP(TILE_K, patch_x,
            {
                unsigned tile_idx = patch_y * TILE_X * STRIDE_Y + patch_x * STRIDE_X;
                unsigned out_idx  = patch_y * TILE_K + patch_x;

                unsigned y=0;
                LOOP(KERNEL_HEIGHT, y,
                {
                    unsigned x=0;
                    LOOP(KERNEL_WIDTH, x,
                    {
                        unsigned offset_idx   = y * TILE_X + x;
                        unsigned out_chan_idx = y * KERNEL_WIDTH + x;

                        blockC[out_idx] = mad( BLOCK_A( tile_idx + offset_idx ), blockB[out_chan_idx], blockC[out_idx] );
                    } )
                } )
            } )
        } )

// BLOCK_A is only meaningful while blockA is in scope; do not leak it past the kernel.
#undef BLOCK_A
    }
    while ( ++patch_depth < INPUT_FEATURE_NUM );

    // Dst resembles a cube of width x height x (output channel * batches).  Each tile writes:
    // TILE_K x TILE_M x SIMD.  Partial writes most likely generated if output padding used.
    // Group stores into vectors to expedite writeback.  One large write is faster than many
    // small saves. Right-most column may be smaller if output width not divisible by tile width.
    __global half *out = dst
        + batch_id * OUTPUT_BATCH_PITCH            // batch offset
        + out_fm * OUTPUT_FEATURE_PITCH            // channel offset
        + ( global_y * TILE_M ) * OUTPUT_Y_PITCH   // y offset
        + ( global_x * TILE_K );                   // x offset

    // Work-items that fall into the alignment padding of the feature/batch range write nothing.
    if ( batch_id < OUTPUT_BATCH_NUM && out_fm < OUTPUT_FEATURE_NUM )
    {
        // Loaded once and shared by both the full-tile and the partial-tile path.
        half bias = biases[out_fm];
        if ( OUTPUT_SIZE_X % TILE_K == 0 ||
             group_x < max_group_x - 1 )
        {
            // Full-width tile: store TILE_K values per row.
            typedef CAT( half, TILE_K ) half_t;
            for( unsigned y = 0; y < TILE_M; y++ )
            {
                if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
                {
                    half_t vBlockC;
                    half *pvBlockC = (half*)&vBlockC;
                    for (unsigned i = 0; i < TILE_K; i++) pvBlockC[i] = activation_function(blockC[y * TILE_K + i] + bias, ACTIVATION_PARAMS);
                    *(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
                }
            }
        }
        else
        {
            // Right-most tile of a row: store only RIGHT_PARTIAL_TILE_K values per row.
            typedef CAT( half, RIGHT_PARTIAL_TILE_K ) half_t;
            for( unsigned y = 0; y < TILE_M; y++ )
            {
                if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
                {
                    half_t vBlockC;
                    half *pvBlockC = (half*)&vBlockC;
                    for (unsigned i = 0; i < RIGHT_PARTIAL_TILE_K; i++) pvBlockC[i] = activation_function(blockC[y * TILE_K + i] + bias, ACTIVATION_PARAMS);
                    *(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
                }
            }
        }
    }
}
#endif // cl_intel_subgroups_short
|
||||
@@ -2,7 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define INPUT_TYPE INPUT0_TYPE
|
||||
@@ -19,59 +21,19 @@
|
||||
|
||||
#define AS_FILTER_TYPE8 CAT(as_, FILTER_TYPE8)
|
||||
|
||||
#if INPUT0_TYPE_SIZE == 2
|
||||
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ2(ptr, offset) AS_INPUT_TYPE2(intel_sub_group_block_read_us2((__global ushort*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read_us4((__global ushort*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
|
||||
#elif INPUT0_TYPE_SIZE == 4
|
||||
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ2(ptr, offset) AS_INPUT_TYPE2(intel_sub_group_block_read2((__global uint*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported input type
|
||||
#endif
|
||||
|
||||
#if FILTER_TYPE_SIZE == 2
|
||||
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
|
||||
#elif FILTER_TYPE_SIZE == 4
|
||||
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
|
||||
#endif
|
||||
|
||||
#if OUTPUT_FORMAT_BFYX
|
||||
# define OUTPUTVTYPE(n) CAT(OUTPUT_TYPE, n)
|
||||
# define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE(OUTPUT_X_BLOCK_SIZE))
|
||||
# define VSTORE CAT(vstore, OUTPUT_X_BLOCK_SIZE)
|
||||
#else
|
||||
# if OUTPUT_TYPE_SIZE == 1
|
||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
|
||||
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr) + (offset), as_uchar2(val))
|
||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr) + (offset), as_uchar4(val))
|
||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
|
||||
# elif OUTPUT_TYPE_SIZE == 2
|
||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
|
||||
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write_us2((__global ushort*)(ptr) + (offset), as_ushort2(val))
|
||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write_us4((__global ushort*)(ptr) + (offset), as_ushort4(val))
|
||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
|
||||
# elif OUTPUT_TYPE_SIZE == 4
|
||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
|
||||
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write2((__global uint*)(ptr) + (offset), as_uint2(val))
|
||||
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
|
||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
|
||||
# else
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported output type
|
||||
# endif
|
||||
#endif // OUTPUT_FORMAT_BFYX
|
||||
|
||||
#if INPUT0_TYPE_SIZE == 2
|
||||
# define AS_INPUT_SRC CAT(as_, MAKE_VECTOR_TYPE(INPUT_TYPE, OUTPUT_X_BLOCK_SIZE))
|
||||
# define AS_US_SRC CAT(as_, MAKE_VECTOR_TYPE(ushort, OUTPUT_X_BLOCK_SIZE))
|
||||
# define GET_SRC(data, id) AS_INPUT_SRC(intel_sub_group_shuffle(AS_US_SRC(data), id))
|
||||
# define GET_SRC(data, id) AS_INPUT_SRC(_sub_group_shuffle(AS_US_SRC(data), id))
|
||||
#else
|
||||
# define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
|
||||
# define GET_SRC(data, id) _sub_group_shuffle(data, id)
|
||||
#endif
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
@@ -79,7 +41,7 @@
|
||||
#define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
||||
#define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
|
||||
KERNEL(convolution_bfyx_f16)(
|
||||
__global INPUT0_TYPE* input,
|
||||
@@ -169,12 +131,12 @@ KERNEL(convolution_bfyx_f16)(
|
||||
|
||||
#if BIAS_TERM
|
||||
#if SLM_DIV_FACTOR == 1
|
||||
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
vec_t dst = (vec_t)(DT_INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
#else
|
||||
vec_t dst;
|
||||
|
||||
if (feature_sub_block == 0) {
|
||||
dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
dst = (vec_t)(DT_INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
|
||||
} else {
|
||||
dst = INPUT0_VAL_ZERO;
|
||||
}
|
||||
@@ -240,7 +202,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
{
|
||||
int xb = 0;
|
||||
for (; xb + 8 <= INPUT_LINE_SIZE; xb += 8) {
|
||||
INPUT_TYPE8 vv = INPUT_BLOCK_READ8(input, grouped_input_offset +
|
||||
INPUT_TYPE8 vv = DT_INPUT_BLOCK_READ8(input, grouped_input_offset +
|
||||
icb * input_fs_pitch +
|
||||
kh * DILATION_SIZE_Y * input_y_pitch +
|
||||
xb * input_x_pitch);
|
||||
@@ -255,7 +217,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
line_cache[xb + 7] = vv[7];
|
||||
}
|
||||
for (; xb + 4 <= INPUT_LINE_SIZE; xb += 4) {
|
||||
INPUT_TYPE4 vv = INPUT_BLOCK_READ4(input, grouped_input_offset +
|
||||
INPUT_TYPE4 vv = DT_INPUT_BLOCK_READ4(input, grouped_input_offset +
|
||||
icb * input_fs_pitch +
|
||||
kh * DILATION_SIZE_Y * input_y_pitch +
|
||||
xb * input_x_pitch);
|
||||
@@ -266,7 +228,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
line_cache[xb + 3] = vv[3];
|
||||
}
|
||||
for (; xb < INPUT_LINE_SIZE; xb++) {
|
||||
line_cache[xb] = INPUT_BLOCK_READ(input, grouped_input_offset +
|
||||
line_cache[xb] = DT_INPUT_BLOCK_READ(input, grouped_input_offset +
|
||||
icb * input_fs_pitch +
|
||||
kh * DILATION_SIZE_Y * input_y_pitch +
|
||||
xb * input_x_pitch);
|
||||
@@ -333,11 +295,11 @@ KERNEL(convolution_bfyx_f16)(
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported input feature size for multiple groups input preload
|
||||
#endif // FILTER_IFM_NUM
|
||||
#else
|
||||
FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
|
||||
FILTER_TYPE8 wei0 = DT_FILTER_BLOCK_READ8(weights, grouped_filter_offset +
|
||||
icb * filter_is_pitch +
|
||||
kh * filter_y_pitch +
|
||||
kw * filter_x_pitch);
|
||||
FILTER_TYPE8 wei1 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
|
||||
FILTER_TYPE8 wei1 = DT_FILTER_BLOCK_READ8(weights, grouped_filter_offset +
|
||||
icb * filter_is_pitch +
|
||||
kh * filter_y_pitch +
|
||||
kw * filter_x_pitch +
|
||||
@@ -388,8 +350,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (feature_sub_block == 0) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
unroll_for(int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
|
||||
#endif // SLM_DIV_FACTOR > 1
|
||||
|
||||
@@ -453,13 +414,13 @@ KERNEL(convolution_bfyx_f16)(
|
||||
#endif
|
||||
#else
|
||||
#if OUTPUT_X_BLOCK_SIZE == 8
|
||||
OUTPUT_BLOCK_WRITE8(output, output_offset, res);
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, output_offset, res);
|
||||
#elif OUTPUT_X_BLOCK_SIZE == 4
|
||||
OUTPUT_BLOCK_WRITE4(output, output_offset, res);
|
||||
DT_OUTPUT_BLOCK_WRITE4(output, output_offset, res);
|
||||
#elif OUTPUT_X_BLOCK_SIZE == 2
|
||||
OUTPUT_BLOCK_WRITE2(output, output_offset, res);
|
||||
DT_OUTPUT_BLOCK_WRITE2(output, output_offset, res);
|
||||
#elif OUTPUT_X_BLOCK_SIZE == 1
|
||||
OUTPUT_BLOCK_WRITE(output, output_offset, res);
|
||||
DT_OUTPUT_BLOCK_WRITE(output, output_offset, res);
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
|
||||
#endif
|
||||
@@ -480,7 +441,7 @@ KERNEL(convolution_bfyx_f16)(
|
||||
#if OUTPUT_FORMAT_BFYX
|
||||
output[output_offset + i] = res[i];
|
||||
#else
|
||||
OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
|
||||
DT_OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@@ -511,20 +472,8 @@ KERNEL(convolution_bfyx_f16)(
|
||||
|
||||
#undef AS_FILTER_TYPE8
|
||||
|
||||
#undef INPUT_BLOCK_READ
|
||||
#undef INPUT_BLOCK_READ2
|
||||
#undef INPUT_BLOCK_READ4
|
||||
#undef INPUT_BLOCK_READ8
|
||||
|
||||
#undef FILTER_BLOCK_READ8
|
||||
|
||||
#if OUTPUT_FORMAT_BFYX
|
||||
# undef OUTPUTVTYPE
|
||||
# undef TO_OUTPUTVTYPE
|
||||
# undef VSTORE
|
||||
#else
|
||||
# undef OUTPUT_BLOCK_WRITE
|
||||
# undef OUTPUT_BLOCK_WRITE2
|
||||
# undef OUTPUT_BLOCK_WRITE4
|
||||
# undef OUTPUT_BLOCK_WRITE8
|
||||
#endif // OUTPUT_FORMAT_BFYX
|
||||
|
||||
@@ -2,17 +2,18 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/unit_type.cl"
|
||||
|
||||
#if X_BLOCK_SIZE > 1
|
||||
# define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
|
||||
intel_sub_group_shuffle( \
|
||||
_sub_group_shuffle( \
|
||||
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
|
||||
id))
|
||||
#else
|
||||
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
|
||||
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, _sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
|
||||
#endif
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
@@ -22,7 +23,7 @@
|
||||
# define UNIT_BLOCK_WRITE_VEC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, X_BLOCK_SIZE)(ptr, offset, val)
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
|
||||
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
__global INPUT0_TYPE* input,
|
||||
@@ -211,8 +212,7 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (feature_sub_block == 0) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
unroll_for(int i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
|
||||
#endif // SLM_DIV_FACTOR > 1
|
||||
|
||||
|
||||
@@ -3,9 +3,9 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
@@ -22,42 +22,9 @@
|
||||
#define AS_FILTER_TYPE2 CAT(as_, FILTER_TYPE2)
|
||||
#define TO_OUTPUT_TYPE8 CAT(convert_, OUTPUT_TYPE8)
|
||||
|
||||
#if INPUT0_TYPE_SIZE == 2
|
||||
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
|
||||
#elif INPUT0_TYPE_SIZE == 4
|
||||
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
|
||||
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported input type.
|
||||
#endif
|
||||
|
||||
#if FILTER_TYPE_SIZE == 2
|
||||
# define FILTER_BLOCK_READ(ptr, offset) AS_FILTER_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
|
||||
# define FILTER_BLOCK_READ2(ptr, offset) AS_FILTER_TYPE2(intel_sub_group_block_read_us2((__global ushort*)(ptr) + (offset)))
|
||||
#elif FILTER_TYPE_SIZE == 4
|
||||
# define FILTER_BLOCK_READ(ptr, offset) AS_FILTER_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
|
||||
# define FILTER_BLOCK_READ2(ptr, offset) AS_FILTER_TYPE2(intel_sub_group_block_read2((__global uint*)(ptr) + (offset)))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported filter type.
|
||||
#endif
|
||||
|
||||
#if OUTPUT_TYPE_SIZE == 1
|
||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
|
||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
|
||||
#elif OUTPUT_TYPE_SIZE == 2
|
||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
|
||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
|
||||
#elif OUTPUT_TYPE_SIZE == 4
|
||||
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
|
||||
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
|
||||
#else
|
||||
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported output type.
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
|
||||
KERNEL(convolution_depthwise)(
|
||||
KERNEL(convolution_gpu_bfyx_f16_depthwise)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
__global FILTER_TYPE* weights,
|
||||
@@ -96,32 +63,32 @@ KERNEL(convolution_depthwise)(
|
||||
(f_block + input_fs_pad_before) * input_fs_pitch;
|
||||
|
||||
#if BIAS_TERM
|
||||
INPUT_TYPE8 dst = (INPUT_TYPE8)(INPUT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
|
||||
INPUT_TYPE8 dst = (INPUT_TYPE8)(DT_INPUT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
|
||||
#else
|
||||
INPUT_TYPE8 dst = (INPUT_TYPE8)(INPUT0_VAL_ZERO);
|
||||
#endif
|
||||
|
||||
#if ((FILTER_SIZE_X == 3) && (FILTER_SIZE_Y == 3) && (STRIDE_SIZE_X == 1) && (DILATION_SIZE_X == 1) && (DILATION_SIZE_Y == 1))
|
||||
|
||||
FILTER_TYPE wei_00 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_01 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_02 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_10 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_11 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_12 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_20 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_21 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_22 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_00 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_01 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_02 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_10 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_11 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_12 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_20 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_21 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
|
||||
FILTER_TYPE wei_22 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
|
||||
|
||||
INPUT_TYPE8 src_block_0 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 0) * input_y_pitch + (input_x) * input_x_pitch);
|
||||
INPUT_TYPE8 src_block_1 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 1) * input_y_pitch + (input_x) * input_x_pitch);
|
||||
INPUT_TYPE8 src_block_2 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 2) * input_y_pitch + (input_x) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_00 = INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 8) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_01 = INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 9) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_10 = INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 8) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_11 = INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 9) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_20 = INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 8) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_21 = INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 9) * input_x_pitch);
|
||||
INPUT_TYPE8 src_block_0 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 0) * input_y_pitch + (input_x) * input_x_pitch);
|
||||
INPUT_TYPE8 src_block_1 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 1) * input_y_pitch + (input_x) * input_x_pitch);
|
||||
INPUT_TYPE8 src_block_2 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 2) * input_y_pitch + (input_x) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_00 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 8) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_01 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 9) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_10 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 8) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_11 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 9) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_20 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 8) * input_x_pitch);
|
||||
INPUT_TYPE src_tail_21 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 9) * input_x_pitch);
|
||||
|
||||
#if X_BLOCK_SIZE == 8
|
||||
for (uint i = 0; i < X_BLOCK_SIZE - 2; i++)
|
||||
@@ -185,12 +152,12 @@ KERNEL(convolution_depthwise)(
|
||||
|
||||
unroll_for (uint i = 0; i < FILTER_SIZE_Y; i++) {
|
||||
unroll_for (uint j = 0; j < FILTER_SIZE_X_DIV_2; j++) {
|
||||
wei_temp = FILTER_BLOCK_READ2(weights, filter_offset + i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + j * 2 * FEATURE_SLICE_SIZE);
|
||||
wei_temp = DT_FILTER_BLOCK_READ2(weights, filter_offset + i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + j * 2 * FEATURE_SLICE_SIZE);
|
||||
wei[i * FILTER_SIZE_X + j * 2] = wei_temp.s0;
|
||||
wei[i * FILTER_SIZE_X + j * 2 + 1] = wei_temp.s1;
|
||||
}
|
||||
#if (FILTER_SIZE_X % 2)
|
||||
wei[i * FILTER_SIZE_X + FILTER_SIZE_X - 1] = FILTER_BLOCK_READ(weights, filter_offset +
|
||||
wei[i * FILTER_SIZE_X + FILTER_SIZE_X - 1] = DT_FILTER_BLOCK_READ(weights, filter_offset +
|
||||
i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE +
|
||||
(FILTER_SIZE_X - 1) * FEATURE_SLICE_SIZE);
|
||||
#endif // (FILTER_SIZE_X % 2)
|
||||
@@ -201,7 +168,7 @@ KERNEL(convolution_depthwise)(
|
||||
unroll_for (uint k = 0; k < X_BLOCK_SIZE; k++) {
|
||||
unroll_for (uint i = 0; i < FILTER_SIZE_Y; i++) {
|
||||
unroll_for (uint j = 0; j < FILTER_SIZE_X; j++) {
|
||||
src[k * FILTER_SIZE_Y * FILTER_SIZE_X + i * FILTER_SIZE_X + j] = INPUT_BLOCK_READ(input, input_offset +
|
||||
src[k * FILTER_SIZE_Y * FILTER_SIZE_X + i * FILTER_SIZE_X + j] = DT_INPUT_BLOCK_READ(input, input_offset +
|
||||
(input_y + (i * DILATION_SIZE_Y)) * input_y_pitch +
|
||||
(input_x + (j * DILATION_SIZE_X) + k * STRIDE_SIZE_X) * input_x_pitch);
|
||||
}
|
||||
@@ -260,7 +227,7 @@ KERNEL(convolution_depthwise)(
|
||||
#else
|
||||
res = TO_OUTPUT_TYPE8(dst);
|
||||
#endif // HAS_FUSED_OPS
|
||||
OUTPUT_BLOCK_WRITE8(output, output_offset + x * output_x_pitch, res);
|
||||
DT_OUTPUT_BLOCK_WRITE8(output, output_offset + x * output_x_pitch, res);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -271,7 +238,7 @@ KERNEL(convolution_depthwise)(
|
||||
#else
|
||||
res[i] = TO_OUTPUT_TYPE(dst[i]);
|
||||
#endif // HAS_FUSED_OPS
|
||||
OUTPUT_BLOCK_WRITE(output, output_offset + (x + i) * output_x_pitch, res[i]);
|
||||
DT_OUTPUT_BLOCK_WRITE(output, output_offset + (x + i) * output_x_pitch, res[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -300,13 +267,11 @@ KERNEL(convolution_depthwise)(
|
||||
#else
|
||||
res = TO_OUTPUT_TYPE(dst[0]);
|
||||
#endif // HAS_FUSED_OPS
|
||||
OUTPUT_BLOCK_WRITE(output, output_offset + x * output_x_pitch, res);
|
||||
DT_OUTPUT_BLOCK_WRITE(output, output_offset + x * output_x_pitch, res);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
|
||||
#undef FEATURE_SLICE_SIZE
|
||||
#undef X_BLOCK_SIZE
|
||||
|
||||
@@ -322,12 +287,3 @@ KERNEL(convolution_depthwise)(
|
||||
|
||||
#undef AS_FILTER_TYPE2
|
||||
#undef TO_OUTPUT_TYPE8
|
||||
|
||||
#undef INPUT_BLOCK_READ
|
||||
#undef INPUT_BLOCK_READ8
|
||||
|
||||
#undef FILTER_BLOCK_READ
|
||||
#undef FILTER_BLOCK_READ2
|
||||
|
||||
#undef OUTPUT_BLOCK_WRITE
|
||||
#undef OUTPUT_BLOCK_WRITE8
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#if defined(cl_intel_subgroups_short)
|
||||
@@ -10,7 +9,7 @@
|
||||
#define TILE_K FILTER_SIZE_X
|
||||
#define TILE_N 32
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_f16)(
|
||||
const __global half *src0,
|
||||
__global half *dst,
|
||||
@@ -207,12 +206,12 @@ KERNEL(convolution_f16)(
|
||||
interleaved_y = 0;
|
||||
LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
|
||||
{
|
||||
p4BlockB00[interleaved_y] = intel_sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
|
||||
p4BlockB00[interleaved_y] = _sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
|
||||
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
|
||||
} )
|
||||
if ( kernel_width_is_odd )
|
||||
{
|
||||
p2BlockB00[FILTER_SIZE_X - 1] = intel_sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
|
||||
p2BlockB00[FILTER_SIZE_X - 1] = _sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
|
||||
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#define TILE_K FILTER_SIZE_X
|
||||
#define TILE_N 32
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(8)))
|
||||
REQD_SUB_GROUP_SIZE(8)
|
||||
KERNEL(convolution_f32)(
|
||||
const __global float *src0,
|
||||
__global float *dst,
|
||||
@@ -149,12 +149,12 @@ KERNEL(convolution_f32)(
|
||||
interleaved_y = 0;
|
||||
LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
|
||||
{
|
||||
p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
|
||||
p8BlockB00[interleaved_y] = as_float8( _sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
|
||||
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
|
||||
} )
|
||||
if ( kernel_width_is_odd )
|
||||
{
|
||||
p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
|
||||
p4BlockB00[FILTER_SIZE_X - 1] = as_float4( _sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
|
||||
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
|
||||
}
|
||||
|
||||
|
||||
@@ -2,10 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL(convolution_gpu_bfyx_iyxo_5x5)(
|
||||
const __global UNIT_TYPE* input,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
|
||||
@@ -53,7 +53,7 @@ if (_kernel_data.leftovers)
|
||||
#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
|
||||
const __global UNIT_TYPE* input,
|
||||
@@ -173,10 +173,10 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
|
||||
|
||||
#if IN_BLOCK_WIDTH != SUB_GROUP_SIZE
|
||||
//if we fix the programming model, then we could use a nice simple 2d array: val = in[br * STRIDE_SIZE_Y + kr][bc * STRIDE_SIZE_X + kc];
|
||||
UNIT_TYPE val = intel_sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
|
||||
UNIT_TYPE val = _sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
|
||||
(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) % SUB_GROUP_SIZE);
|
||||
#else
|
||||
UNIT_TYPE val = intel_sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
|
||||
UNIT_TYPE val = _sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
|
||||
#endif
|
||||
|
||||
out[br * OUTPUT_BLOCK_WIDTH + bc] = mad(w[wi % PREFETCH], val, out[br * OUTPUT_BLOCK_WIDTH + bc]);
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
@@ -17,9 +19,9 @@
|
||||
#error "convolution_gpu_bfyx_to_bfyx_f16: Filter and bias has different data type."
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
|
||||
KERNEL(convolution_bfyx_to_bfyx_f16)(
|
||||
KERNEL(convolution_gpu_bfyx_to_bfyx_f16)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
__global FILTER_TYPE* weights,
|
||||
@@ -134,7 +136,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
|
||||
INPUT0_TYPE src[INPUT0_FEATURE_NUM];
|
||||
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
|
||||
for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) {
|
||||
src[ic] = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
|
||||
src[ic] = _sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
|
||||
dst[i] = mad(wei[ic], src[ic], dst[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/unit_type.cl"
|
||||
|
||||
@@ -10,7 +12,7 @@
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
#define INPUT_FEATURE_NUM 3
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
|
||||
KERNEL(convolution_gpu_bfyx_to_bs_fs_yx_bsv16_fsv16)(
|
||||
__global INPUT0_TYPE* input,
|
||||
@@ -142,7 +144,7 @@ KERNEL(convolution_gpu_bfyx_to_bs_fs_yx_bsv16_fsv16)(
|
||||
|
||||
__attribute__((opencl_unroll_hint(INPUT_FEATURE_NUM)))
|
||||
for (int ic = 0; ic < INPUT_FEATURE_NUM; ic++) {
|
||||
UNIT_TYPE src = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
|
||||
UNIT_TYPE src = _sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
|
||||
dst[i] = mad(w[ic], src, dst[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/unit_type.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
|
||||
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
|
||||
|
||||
@@ -58,7 +58,7 @@
|
||||
|
||||
#define ALIGNED_IFM_NUM (((FILTER_IFM_NUM + FSV - 1) / FSV) * FSV)
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
|
||||
__global UNIT_TYPE* input,
|
||||
@@ -164,7 +164,7 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
|
||||
// With simd along x dimension:
|
||||
// (out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) / SUB_GROUP_SIZE - element number in simd-lane;
|
||||
// (out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) % SUB_GROUP_SIZE - simd-lane with that element.
|
||||
UNIT_TYPE in_val = intel_sub_group_shuffle(
|
||||
UNIT_TYPE in_val = _sub_group_shuffle(
|
||||
in[(out_y * STRIDE_SIZE_Y + f_y * DILATION_SIZE_Y) * INPUT_BLOCK_WIDTH_EL_CNT +
|
||||
(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) / SUB_GROUP_SIZE],
|
||||
(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) % SUB_GROUP_SIZE);
|
||||
@@ -299,8 +299,6 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
|
||||
// ========================================================================
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
|
||||
#undef INPUT0_SIZE_X_WITH_PADDING
|
||||
#undef INPUT0_SIZE_Y_WITH_PADDING
|
||||
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
|
||||
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
|
||||
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
|
||||
@@ -41,7 +41,7 @@
|
||||
// ======================================================================================
|
||||
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL(convolution_gpu_fs_byx_fsv32)(
|
||||
__global INPUT0_TYPE* input,
|
||||
@@ -128,7 +128,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
|
||||
{
|
||||
unroll_for (uint out_f = 0; out_f < FSV_PER_THREAD; ++out_f)
|
||||
{
|
||||
INPUT0_TYPE in_val = intel_sub_group_shuffle(
|
||||
INPUT0_TYPE in_val = _sub_group_shuffle(
|
||||
in[(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) * FSV_PER_THREAD + ifii / SUB_GROUP_SIZE],
|
||||
ifii % SUB_GROUP_SIZE);
|
||||
|
||||
@@ -242,8 +242,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
|
||||
// ========================================================================
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
|
||||
#undef INPUT0_SIZE_X_WITH_PADDING
|
||||
#undef INPUT0_SIZE_Y_WITH_PADDING
|
||||
#undef INPUT0_SIZE_B_WITH_PADDING
|
||||
|
||||
@@ -3,11 +3,11 @@
|
||||
//
|
||||
|
||||
#include "include/unit_type.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
|
||||
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
|
||||
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
|
||||
@@ -33,7 +33,7 @@
|
||||
// OUTPUT_BLOCK_HEIGHT - [int] number of elements calculated in y dimension by one thread
|
||||
// ======================================================================================
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
|
||||
__global UNIT_TYPE* input,
|
||||
@@ -109,7 +109,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
|
||||
{
|
||||
unroll_for (uint out_f = 0; out_f < FSV_PER_THREAD; ++out_f)
|
||||
{
|
||||
UNIT_TYPE in_val = intel_sub_group_shuffle(input_read[in_f / SUB_GROUP_SIZE], in_f % SUB_GROUP_SIZE);
|
||||
UNIT_TYPE in_val = _sub_group_shuffle(input_read[in_f / SUB_GROUP_SIZE], in_f % SUB_GROUP_SIZE);
|
||||
|
||||
const uint out_idx = out_y * OUTPUT_BLOCK_WIDTH * FSV_PER_THREAD + out_x * FSV_PER_THREAD + out_f;
|
||||
out[out_idx] = mad(w[in_f * FSV_PER_THREAD + out_f], in_val, out[out_idx]);
|
||||
@@ -236,8 +236,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
|
||||
// ========================================================================
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
|
||||
#undef INPUT0_SIZE_X_WITH_PADDING
|
||||
#undef INPUT0_SIZE_Y_WITH_PADDING
|
||||
#undef INPUT0_SIZE_B_WITH_PADDING
|
||||
|
||||
@@ -2,12 +2,11 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/unit_type.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
|
||||
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
|
||||
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
|
||||
@@ -32,9 +31,9 @@
|
||||
// ======================================================================================
|
||||
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
|
||||
KERNEL(convolution_gpu_fs_byx_fsv32)(
|
||||
KERNEL(convolution_gpu_fs_byx_fsv32_depthwise)(
|
||||
__global UNIT_TYPE* input,
|
||||
__global UNIT_TYPE* output,
|
||||
__global UNIT_TYPE* weights,
|
||||
@@ -226,8 +225,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
|
||||
// ========================================================================
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
|
||||
#undef INPUT0_SIZE_X_WITH_PADDING
|
||||
#undef INPUT0_SIZE_Y_WITH_PADDING
|
||||
#undef INPUT0_SIZE_B_WITH_PADDING
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#if QUANTIZATION_TERM
|
||||
# define ACCUMULATOR_TYPE int
|
||||
# define TO_ACCUMULATOR_TYPE(x) convert_int(x)
|
||||
@@ -40,9 +41,6 @@
|
||||
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
|
||||
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
#define ALIGN(a, b) ((a % b == 0) ? a : a - a % b + b)
|
||||
|
||||
#if INPUT0_PAD_BEFORE_SIZE_X != 0 || INPUT0_PAD_BEFORE_SIZE_Y != 0
|
||||
#define NON_ZERO_INPUT0_PAD_BEFORE
|
||||
#endif
|
||||
@@ -68,7 +66,7 @@
|
||||
|
||||
// int8 conv_input and weights data is packed to int32 "batches",
|
||||
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD_SIZE)
|
||||
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
|
||||
KERNEL (fused_convolution_eltwise_gpu_imad)(
|
||||
#if INPUT0_LAYOUT_B_FS_YX_FSV16
|
||||
@@ -134,8 +132,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
|
||||
int weights_zp_vec_partial;
|
||||
weights_zp_vec_partial = weights_zp_val;
|
||||
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
|
||||
unroll_for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
|
||||
wzp_p[in_f] = 0;
|
||||
}
|
||||
#endif
|
||||
@@ -237,7 +234,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
|
||||
#endif
|
||||
#else
|
||||
#ifdef BLOCK_LOAD_INPUTS
|
||||
in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*) &conv_input[in_addr]));
|
||||
in[reg] = AS_PACKED_TYPE(_sub_group_block_read((const __global uint*) &conv_input[in_addr]));
|
||||
#ifdef SHOULD_USE_DATA_ZP
|
||||
if (input_on_padding)
|
||||
in[reg] = data_zp_val;
|
||||
@@ -255,8 +252,8 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
|
||||
}
|
||||
|
||||
#ifdef BLOCK_LOAD_WEIGHTS
|
||||
*((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
|
||||
w[8] = as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
|
||||
*((int8*)&w[0]) = as_int8(_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
|
||||
w[8] = as_int(_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
|
||||
weight_addr += SIMD_SIZE*NUM_FILTERS;
|
||||
#else
|
||||
for(int pf = 0; pf < NUM_FILTERS; pf++) {
|
||||
@@ -278,10 +275,8 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
|
||||
dotProdAZPxW = TO_ACCUMULATOR_TYPE(IMAD(dotProdAZPxW, AS_INPUT0_TYPE_4(data_zp_val), AS_FILTER_TYPE_4(w[wi])));
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
|
||||
unroll_for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
|
||||
unroll_for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
|
||||
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y],
|
||||
bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X));
|
||||
|
||||
@@ -403,5 +398,3 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
|
||||
#undef FILTER_TYPE_4
|
||||
#undef AS_FILTER_TYPE_4
|
||||
#undef NUM_FILTERS
|
||||
#undef CEIL_DIV
|
||||
#undef ALIGN
|
||||
|
||||
@@ -4,7 +4,8 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#if QUANTIZATION_TERM
|
||||
#define ACCUMULATOR_TYPE int
|
||||
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
|
||||
@@ -24,7 +25,7 @@
|
||||
#define BATCH_SLICE_SIZE 16
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
|
||||
const __global INPUT0_TYPE *conv_input,
|
||||
__global OUTPUT_TYPE *output,
|
||||
@@ -63,15 +64,15 @@ KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
|
||||
|
||||
__attribute__((opencl_unroll_hint(16)))
|
||||
for (uint j = 0; j < 16; j++) {
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val.s0, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val.s1, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val.s2, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val.s3, j))));
|
||||
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val2.s0, j))));
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val2.s1, j))));
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val2.s2, j))));
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val2.s3, j))));
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val2.s0, j))));
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val2.s1, j))));
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val2.s2, j))));
|
||||
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val2.s3, j))));
|
||||
}
|
||||
filter_idx += weights_x_pitch;
|
||||
filter_idx2 += weights_x_pitch;
|
||||
@@ -94,7 +95,7 @@ KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
|
||||
|
||||
ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
|
||||
#if BIAS_TERM
|
||||
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + intel_sub_group_shuffle(bias, i);
|
||||
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + _sub_group_shuffle(bias, i);
|
||||
#else
|
||||
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i];
|
||||
#endif
|
||||
|
||||
@@ -4,7 +4,8 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#if QUANTIZATION_TERM
|
||||
#define ACCUMULATOR_TYPE int
|
||||
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
|
||||
@@ -25,7 +26,7 @@
|
||||
|
||||
// int8 conv_input and weights data is packed to int32 "batches",
|
||||
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_3x3)(
|
||||
const __global INPUT0_TYPE *conv_input,
|
||||
__global OUTPUT_TYPE *output,
|
||||
@@ -68,10 +69,10 @@ uint split_idx)
|
||||
|
||||
__attribute__((opencl_unroll_hint(16)))
|
||||
for (uint j = 0; j < 16; j++) {
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val.s0, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val.s1, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val.s2, j))));
|
||||
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val.s3, j))));
|
||||
}
|
||||
filter_idx += weights_x_pitch;
|
||||
input_idx += input_x_pitch;
|
||||
@@ -93,7 +94,7 @@ uint split_idx)
|
||||
for (uint i = 0; i < 16; i++) {
|
||||
ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
|
||||
#if BIAS_TERM
|
||||
dequantized = (ACTIVATION_TYPE)dotProd[i] + intel_sub_group_shuffle(bias, i);
|
||||
dequantized = (ACTIVATION_TYPE)dotProd[i] + _sub_group_shuffle(bias, i);
|
||||
#else
|
||||
dequantized = (ACTIVATION_TYPE)dotProd[i];
|
||||
#endif
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/mmad.cl"
|
||||
|
||||
@@ -26,7 +27,7 @@
|
||||
#define ACTIVATION_TYPE_VEC float8
|
||||
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
|
||||
#define MMAD MMAD_8x8
|
||||
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
|
||||
#elif OUTPUT_X_BLOCK_SIZE == 4
|
||||
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
|
||||
#define ACCUMULATOR_TYPE_VEC int4
|
||||
@@ -34,13 +35,13 @@
|
||||
#define ACTIVATION_TYPE_VEC float4
|
||||
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
|
||||
#define MMAD MMAD_4x8
|
||||
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
|
||||
#else
|
||||
#error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported block size"
|
||||
#endif
|
||||
|
||||
__attribute__((reqd_work_group_size(8, OW_GROUP, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
KERNEL(convolution_mmad_b_fs_yx_fsv32)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global PACKED_OUT_TYPE* output,
|
||||
@@ -145,7 +146,7 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
|
||||
}
|
||||
else
|
||||
{
|
||||
line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, intel_sub_group_block_read((const __global uint*)(input + in_addr +
|
||||
line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, _sub_group_block_read((const __global uint*)(input + in_addr +
|
||||
icb * input_fs_pitch +
|
||||
kd * DILATION_SIZE_Z * input_z_pitch +
|
||||
kh * DILATION_SIZE_Y * input_y_pitch +
|
||||
@@ -166,10 +167,10 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
|
||||
+ kh * ISV_SIZE * OSV_SIZE * FILTER_SIZE_X
|
||||
+ kw * ISV_SIZE * OSV_SIZE;
|
||||
|
||||
int8 weights_data0 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE)));
|
||||
int8 weights_data1 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE)));
|
||||
int8 weights_data2 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE)));
|
||||
int8 weights_data3 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE)));
|
||||
int8 weights_data0 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE)));
|
||||
int8 weights_data1 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE)));
|
||||
int8 weights_data2 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE)));
|
||||
int8 weights_data3 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE)));
|
||||
|
||||
acc[0] = MMAD(src, weights_data0, acc[0]); // 8 elements in 4*lid+0 out channel
|
||||
acc[1] = MMAD(src, weights_data1, acc[1]); // 8 elements in 4*lid+1 out channel
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(convolution_mmad_b_fs_yx_fsv32_dw)(
|
||||
|
||||
@@ -2,11 +2,10 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/imad.cl"
|
||||
|
||||
#define CEIL_DIV(x, y) (1 + ((x) - 1) / (y))
|
||||
#include "include/batch_headers/imad.cl"
|
||||
|
||||
#define ISV 4
|
||||
|
||||
@@ -30,9 +29,9 @@
|
||||
#define ACTIVATION_TYPE_VEC float8
|
||||
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
|
||||
#if OUTPUT_LAYOUT_B_FS_YX_FSV32
|
||||
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
|
||||
#else // OUTPUT_LAYOUT_B_FS_YX_FSV32
|
||||
#define BLOCK_WRITE(ptr, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr), as_uchar8(val))
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_uc8((__global uchar*)(ptr), as_uchar8(val))
|
||||
#endif // OUTPUT_LAYOUT_B_FS_YX_FSV32
|
||||
#elif OUTPUT_X_BLOCK_SIZE == 4
|
||||
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
|
||||
@@ -41,9 +40,9 @@
|
||||
#define ACTIVATION_TYPE_VEC float4
|
||||
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
|
||||
#if OUTPUT_LAYOUT_B_FS_YX_FSV32
|
||||
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
|
||||
#else // OUTPUT_LAYOUT_B_FS_YX_FSV32
|
||||
#define BLOCK_WRITE(ptr, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr), as_uchar4(val))
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_uc4((__global uchar*)(ptr), as_uchar4(val))
|
||||
#endif // OUTPUT_LAYOUT_B_FS_YX_FSV32
|
||||
#else
|
||||
#error "convolution_gpu_mmad_bfyx_b_fs_yx_fsv32: Unsupported block size"
|
||||
@@ -52,9 +51,8 @@
|
||||
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
|
||||
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
|
||||
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(LWS0, LWS1, LWS2)))
|
||||
KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
|
||||
__global INPUT0_TYPE* input,
|
||||
@@ -265,9 +263,9 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
|
||||
+ kh * OSV * ISV * FILTER_SIZE_X
|
||||
+ kw * OSV * ISV;
|
||||
|
||||
int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
|
||||
int weights_data0 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off)));
|
||||
#if OUTPUT_FEATURE_NUM > 16
|
||||
int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
|
||||
int weights_data1 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
|
||||
#endif
|
||||
PACKED_TYPE_VEC src;
|
||||
|
||||
@@ -492,7 +490,6 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
|
||||
|
||||
#endif // OUTPUT_IS_FP
|
||||
}
|
||||
#undef CEIL_DIV
|
||||
#undef PACKED_TYPE_VEC
|
||||
#undef ACCUMULATOR_TYPE_VEC
|
||||
#undef TO_ACCUMULATOR_TYPE_VEC
|
||||
|
||||
@@ -2,11 +2,10 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/imad.cl"
|
||||
|
||||
#define CEIL_DIV(x, y) (1 + ((x) - 1) / (y))
|
||||
#include "include/batch_headers/imad.cl"
|
||||
|
||||
#ifdef ACCUMULATOR_TYPE
|
||||
#undef ACCUMULATOR_TYPE
|
||||
@@ -27,14 +26,14 @@
|
||||
#define TO_ACCUMULATOR_TYPE_VEC(x) convert_int8(x)
|
||||
#define ACTIVATION_TYPE_VEC float8
|
||||
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
|
||||
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
|
||||
#elif OUTPUT_X_BLOCK_SIZE == 4
|
||||
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
|
||||
#define ACCUMULATOR_TYPE_VEC int4
|
||||
#define TO_ACCUMULATOR_TYPE_VEC(x) convert_int4(x)
|
||||
#define ACTIVATION_TYPE_VEC float4
|
||||
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
|
||||
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
|
||||
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
|
||||
#else
|
||||
#error "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4: Unsupported block size"
|
||||
#endif
|
||||
@@ -43,7 +42,7 @@
|
||||
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
|
||||
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
|
||||
__global INPUT0_TYPE* input,
|
||||
__global PACKED_OUT_TYPE* output,
|
||||
@@ -129,8 +128,8 @@ KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
|
||||
+ kh * OSV * 4 * FILTER_SIZE_X
|
||||
+ kw * OSV * 4;
|
||||
|
||||
int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
|
||||
int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + 16*4)));
|
||||
int weights_data0 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off)));
|
||||
int weights_data1 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off + 16*4)));
|
||||
|
||||
PACKED_TYPE_VEC src;
|
||||
|
||||
@@ -223,7 +222,6 @@ KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
|
||||
#endif // OUTPUT_IS_FP
|
||||
}
|
||||
|
||||
#undef CEIL_DIV
|
||||
#undef PACKED_TYPE_VEC
|
||||
#undef ACCUMULATOR_TYPE_VEC
|
||||
#undef TO_ACCUMULATOR_TYPE_VEC
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------------------
|
||||
// L3_SIMD_4x8
|
||||
// Input matrices dimensions: M x K x N
|
||||
@@ -35,7 +37,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
|
||||
const int INPUT0_SIZE_Y_PITCH_UNIT_4 = INPUT0_PITCH_SIZE_Y / VEC_SIZE; //for bxyf -> INPUT0_PITCH_SIZE_Y is equal to input features count, since ifm % 32 == 0, division by VEC_SIZE is ok
|
||||
const int OUTPUT_SIZE_Y_PITCH_UNIT_4 = OUTPUT_Y_PITCH / VEC_SIZE; //for bxyf -> OUTPUT_Y_PITCH is equal to output features count, since ofm % 32 == 0, division by VEC_SIZE is ok
|
||||
const int WEIGHTS_FEATURE_PITCH_UNIT_4 = WEIGHTS_PITCH_FEATURE / VEC_SIZE; //for xyio -> WEIGHTS_PITCH_FEATURE is equal to the output features count
|
||||
|
||||
|
||||
const int group_x = get_group_id(0);
|
||||
const int group_y = get_group_id(1);
|
||||
const int group_z = get_group_id(2);
|
||||
@@ -59,10 +61,10 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
|
||||
const int y_idx = tile_idx_y; //winograd tile height == 1
|
||||
const int f_idx = group_x * TILE_N + local_x * VEC_SIZE;
|
||||
const int b_idx = batch_idx;
|
||||
|
||||
|
||||
const int in_tile_idx = (x_idx % WINOGRAD_TILE_WIDTH);
|
||||
const int tile_idx_x = (x_idx / WINOGRAD_TILE_WIDTH);
|
||||
|
||||
|
||||
// Result ctile is M rows x N columns
|
||||
// M = 8, we have 1 rows of work-items, so we need 8/1 = 8 results down
|
||||
// N = 32, we have 8 columns of work-items, so we need 32/8 = 4 results across = 1 float4s across
|
||||
@@ -124,11 +126,11 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
|
||||
const UNIT_TYPE_4 a6 = src0[6 * INPUT0_SIZE_Y_PITCH_UNIT_4];
|
||||
const UNIT_TYPE_4 a7 = src0[7 * INPUT0_SIZE_Y_PITCH_UNIT_4];
|
||||
|
||||
#define DOT_PRODUCT( _i, _j ) { a = intel_sub_group_shuffle(a ## _i, _j); c ## _i = mad(a.x, b0, mad(a.y, b1, mad(a.z, b2, mad(a.w, b3, c ## _i)))); }
|
||||
#define DOT_PRODUCT( _i, _j ) { a = _sub_group_shuffle(a ## _i, _j); c ## _i = mad(a.x, b0, mad(a.y, b1, mad(a.z, b2, mad(a.w, b3, c ## _i)))); }
|
||||
|
||||
//in one iteration load weights tile 1-width, 1-height, 4-depth from 4 different filters (ofms)
|
||||
//SIMD reads are chained along b-axis (different ofms), resulting in 1-width, 1-height, 4-depth blocks from 4*8=32 different filters
|
||||
//consecutive reads are chained along f-dim and overflows to y-dim, reading in total
|
||||
//consecutive reads are chained along f-dim and overflows to y-dim, reading in total
|
||||
#define ITERATION( _j ) \
|
||||
{ \
|
||||
const UNIT_TYPE_4 b0 = src1[0]; src1 += WEIGHTS_FEATURE_PITCH_UNIT_4; \
|
||||
@@ -165,7 +167,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
|
||||
|
||||
src0 += TILE_K / VEC_SIZE;
|
||||
}
|
||||
|
||||
|
||||
dst[0] = c0; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;
|
||||
dst[0] = c1; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;
|
||||
dst[0] = c2; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;
|
||||
|
||||
@@ -9,26 +9,27 @@
|
||||
// --------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
|
||||
#define DOT4i0( _result, _A, _B, i) \
|
||||
{ \
|
||||
_result = mad(_A.s0, intel_sub_group_shuffle( _B.s0, (i)), _result); \
|
||||
_result = mad(_A.s0, _sub_group_shuffle( _B.s0, (i)), _result); \
|
||||
}
|
||||
|
||||
#define DOT4i1( _result, _A, _B, i) \
|
||||
{ \
|
||||
_result = mad(_A.s1, intel_sub_group_shuffle( _B.s1, (i)), _result); \
|
||||
_result = mad(_A.s1, _sub_group_shuffle( _B.s1, (i)), _result); \
|
||||
}
|
||||
|
||||
#define DOT4i2( _result, _A, _B, i) \
|
||||
{ \
|
||||
_result = mad(_A.s2, intel_sub_group_shuffle( _B.s2, (i)), _result); \
|
||||
_result = mad(_A.s2, _sub_group_shuffle( _B.s2, (i)), _result); \
|
||||
}
|
||||
|
||||
#define DOT4i3( _result, _A, _B, i) \
|
||||
{ \
|
||||
_result = mad(_A.s3, intel_sub_group_shuffle( _B.s3, (i)), _result); \
|
||||
_result = mad(_A.s3, _sub_group_shuffle( _B.s3, (i)), _result); \
|
||||
}
|
||||
|
||||
#define UNIT_TYPE_2 CAT(UNIT_TYPE, 2)
|
||||
@@ -36,15 +37,15 @@
|
||||
#define UNIT_TYPE_8 CAT(UNIT_TYPE, 8)
|
||||
|
||||
__attribute__((reqd_work_group_size(8, 2, 8)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
(
|
||||
__global INPUT0_TYPE* I,
|
||||
__global OUTPUT_TYPE* O,
|
||||
__global FILTER_TYPE* U,
|
||||
__global INPUT0_TYPE* I,
|
||||
__global OUTPUT_TYPE* O,
|
||||
__global FILTER_TYPE* U,
|
||||
#if BIAS_TERM
|
||||
const __global UNIT_TYPE * bias,
|
||||
#endif
|
||||
#endif
|
||||
uint split_idx)
|
||||
{
|
||||
// (DxC2)x(UxWx8c)
|
||||
@@ -52,17 +53,17 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
__local UNIT_TYPE_4 V[slmSize*2]; // 8 KB
|
||||
|
||||
/* These constants are defined as precompiler macros during compilation. */
|
||||
const uint WC = W*INPUT0_FEATURE_NUM;
|
||||
const uint HW = H*W;
|
||||
const uint HWC = H*WC;
|
||||
const uint WC4 = WC >> 2;
|
||||
const uint K16 = FILTER_OFM_NUM >> 4;
|
||||
const uint C4 = INPUT0_FEATURE_NUM >> 2;
|
||||
const uint K2 = FILTER_OFM_NUM >> 1;
|
||||
const uint QK2 = Q*K2;
|
||||
const uint QK = Q*FILTER_OFM_NUM;
|
||||
const uint PQK = P*QK;
|
||||
|
||||
const uint WC = W*INPUT0_FEATURE_NUM;
|
||||
const uint HW = H*W;
|
||||
const uint HWC = H*WC;
|
||||
const uint WC4 = WC >> 2;
|
||||
const uint K16 = FILTER_OFM_NUM >> 4;
|
||||
const uint C4 = INPUT0_FEATURE_NUM >> 2;
|
||||
const uint K2 = FILTER_OFM_NUM >> 1;
|
||||
const uint QK2 = Q*K2;
|
||||
const uint QK = Q*FILTER_OFM_NUM;
|
||||
const uint PQK = P*QK;
|
||||
|
||||
const uint upperHalf = get_local_id(1);
|
||||
uint gx = get_group_id(0);
|
||||
uint gy = (uint)get_group_id(1)*2+((uint)get_group_id(2)%2);
|
||||
@@ -86,7 +87,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
int x = gx*14 + lz*2 + lxd4 - px;
|
||||
int y = gy*4 - py;
|
||||
uint k = gk*16 + lzd4*8;
|
||||
|
||||
|
||||
// # x->
|
||||
// # M0 M1 M2 M3 M4 M5 M6
|
||||
// # +------------------------------------------
|
||||
@@ -113,13 +114,13 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
|
||||
uint lxm2 = lx % 2;
|
||||
uint lxb1 = (lx & 2)/2;
|
||||
|
||||
|
||||
uint2 coordU0;
|
||||
coordU0.x = (lzm4*24 + k*12);
|
||||
coordU0.y = 0;
|
||||
|
||||
uint slmPipeStage = 0;
|
||||
|
||||
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint c = lxm4; c < C4_up16; c += 4) {
|
||||
|
||||
@@ -142,7 +143,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
bool y5_in = 0 <= (y + 5) && (y + 5) < H && x_in;
|
||||
|
||||
#if INPUT0_LAYOUT_BYXF
|
||||
|
||||
|
||||
/* const UNIT_TYPE_4 I_load_0 = y0_in ? I_load[0*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
|
||||
const UNIT_TYPE_4 I_load_1 = y1_in ? I_load[1*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
|
||||
const UNIT_TYPE_4 I_load_2 = y2_in ? I_load[2*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
|
||||
@@ -227,10 +228,10 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
//uint coordU_x = coordU0.x + get_sub_group_local_id()%8;
|
||||
const uint flatA = coordU0.y*FILTER_OFM_NUM*KCOLSW*KROWSW + coordU0.x + get_sub_group_local_id()%8;
|
||||
const UNIT_TYPE_4 f0 = (UNIT_TYPE_4)(
|
||||
*(__global UNIT_TYPE *)(&U[flatA+0*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
|
||||
*(__global UNIT_TYPE *)(&U[flatA+1*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
|
||||
*(__global UNIT_TYPE *)(&U[flatA+2*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
|
||||
*(__global UNIT_TYPE *)(&U[flatA+3*FILTER_OFM_NUM*KCOLSW*KROWSW])); // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
|
||||
*(__global UNIT_TYPE *)(&U[flatA+0*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
|
||||
*(__global UNIT_TYPE *)(&U[flatA+1*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
|
||||
*(__global UNIT_TYPE *)(&U[flatA+2*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
|
||||
*(__global UNIT_TYPE *)(&U[flatA+3*FILTER_OFM_NUM*KCOLSW*KROWSW])); // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
|
||||
|
||||
// row 0
|
||||
|
||||
@@ -554,7 +555,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
DOT4i3(M6.s2, f1, V13, 2 + c4);
|
||||
DOT4i3(M6.s3, f1, V13, 4 + c4);
|
||||
|
||||
|
||||
|
||||
//flatA += 8;
|
||||
const UNIT_TYPE_4 f2 = (UNIT_TYPE_4)(
|
||||
*(__global UNIT_TYPE *)(&U[flatA + 16 + 0 * FILTER_OFM_NUM*KCOLSW*KROWSW]),
|
||||
@@ -563,7 +564,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
*(__global UNIT_TYPE *)(&U[flatA + 16 + 3 * FILTER_OFM_NUM*KCOLSW*KROWSW]));
|
||||
coordU0.y += 4;
|
||||
|
||||
|
||||
|
||||
// f2[c4] x v[2 .. 16]
|
||||
DOT4i0(M0.s0, f2, V00, 4 + c4);
|
||||
DOT4i0(M0.s1, f2, V00, 6 + c4);
|
||||
@@ -628,7 +629,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
|
||||
// row 1
|
||||
|
||||
|
||||
|
||||
// f2 x v[2 .. 16]
|
||||
DOT4i1(M0.s2, f2, V10, 4 + c4);
|
||||
DOT4i1(M0.s3, f2, V10, 6 + c4);
|
||||
@@ -649,7 +650,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
DOT4i1(M6.s3, f2, V13, 6 + c4);
|
||||
|
||||
|
||||
|
||||
|
||||
// f2[c4] x v[2 .. 16]
|
||||
DOT4i2(M0.s0, f2, V00, 4 + c4);
|
||||
DOT4i2(M0.s1, f2, V00, 6 + c4);
|
||||
@@ -759,11 +760,11 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (lz < 7)
|
||||
if (lz < 7)
|
||||
{
|
||||
// Load multiplies from SLM.
|
||||
__local const UNIT_TYPE_8 *M_read = (__local UNIT_TYPE_8*)&V[lz*8 + lxd4*224 + lxm4*2 + slmSize*upperHalf];
|
||||
|
||||
|
||||
UNIT_TYPE_8 M0 = M_read[0*28];
|
||||
UNIT_TYPE_8 M1 = M_read[1*28];
|
||||
UNIT_TYPE_8 M2 = M_read[2*28];
|
||||
@@ -821,7 +822,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write_0[0] = ACTIVATION(S0.s0, ACTIVATION_PARAMS);
|
||||
O_write_0[0+Q*P] = ACTIVATION(S0.s4, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
#if OUTPUT_LAYOUT_BYXF
|
||||
@@ -829,7 +830,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write[0*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s1 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s5 + bias[bias_index1], ACTIVATION_PARAMS));
|
||||
#else
|
||||
O_write[0*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s1, ACTIVATION_PARAMS), ACTIVATION(S0.s5, ACTIVATION_PARAMS));
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_0[1] = ACTIVATION(S0.s1 + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
@@ -837,8 +838,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
#else
|
||||
O_write_0[1] = ACTIVATION(S0.s1, ACTIVATION_PARAMS);
|
||||
O_write_0[1+Q*P] = ACTIVATION(S0.s5, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -850,7 +851,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write[1*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s0 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s4 + bias[bias_index1], ACTIVATION_PARAMS));
|
||||
#else
|
||||
O_write[1*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s0, ACTIVATION_PARAMS), ACTIVATION(S1.s4, ACTIVATION_PARAMS));
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_1[0] = ACTIVATION(S1.s0 + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
@@ -858,8 +859,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
#else
|
||||
O_write_1[0] = ACTIVATION(S1.s0, ACTIVATION_PARAMS);
|
||||
O_write_1[0+Q*P] = ACTIVATION(S1.s4, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
#if OUTPUT_LAYOUT_BYXF
|
||||
@@ -867,7 +868,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write[1*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s1 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s5 + bias[bias_index1], ACTIVATION_PARAMS));
|
||||
#else
|
||||
O_write[1*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s1, ACTIVATION_PARAMS), ACTIVATION(S1.s5, ACTIVATION_PARAMS));
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_1[1] = ACTIVATION(S1.s1 + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
@@ -875,8 +876,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
#else
|
||||
O_write_1[1] = ACTIVATION(S1.s1, ACTIVATION_PARAMS);
|
||||
O_write_1[1+Q*P] = ACTIVATION(S1.s5, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -888,7 +889,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write[2*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s2 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s6 + bias[bias_index1], ACTIVATION_PARAMS));
|
||||
#else
|
||||
O_write[2*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s2, ACTIVATION_PARAMS), ACTIVATION(S0.s6, ACTIVATION_PARAMS));
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_2[0] = ACTIVATION(S0.s2 + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
@@ -896,8 +897,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
#else
|
||||
O_write_2[0] = ACTIVATION(S0.s2, ACTIVATION_PARAMS);
|
||||
O_write_2[0+Q*P] = ACTIVATION(S0.s6, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
#if OUTPUT_LAYOUT_BYXF
|
||||
@@ -905,7 +906,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write[2*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s3 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s7 + bias[bias_index1], ACTIVATION_PARAMS));
|
||||
#else
|
||||
O_write[2*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s3, ACTIVATION_PARAMS), ACTIVATION(S0.s7, ACTIVATION_PARAMS));
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_2[1] = ACTIVATION(S0.s3 + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
@@ -913,8 +914,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
#else
|
||||
O_write_2[1] = ACTIVATION(S0.s3, ACTIVATION_PARAMS);
|
||||
O_write_2[1+Q*P] = ACTIVATION(S0.s7, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -926,7 +927,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write[3*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s2 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s6 + bias[bias_index1], ACTIVATION_PARAMS));
|
||||
#else
|
||||
O_write[3*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s2, ACTIVATION_PARAMS), ACTIVATION(S1.s6, ACTIVATION_PARAMS));
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_3[0] = ACTIVATION(S1.s2 + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
@@ -934,7 +935,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
#else
|
||||
O_write_3[0] = ACTIVATION(S1.s2, ACTIVATION_PARAMS);
|
||||
O_write_3[0+Q*P] = ACTIVATION(S1.s6, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
@@ -943,7 +944,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
O_write[3*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s3 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s7 + bias[bias_index1], ACTIVATION_PARAMS));
|
||||
#else
|
||||
O_write[3*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s3, ACTIVATION_PARAMS), ACTIVATION(S1.s7, ACTIVATION_PARAMS));
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_3[1] = ACTIVATION(S1.s3 + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
@@ -951,8 +952,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
|
||||
#else
|
||||
O_write_3[1] = ACTIVATION(S1.s3, ACTIVATION_PARAMS);
|
||||
O_write_3[1+Q*P] = ACTIVATION(S1.s7, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
// --------------------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
|
||||
|
||||
#define DOT8i_0( _result, _A, _B, i) \
|
||||
@@ -63,7 +63,7 @@
|
||||
|
||||
|
||||
__attribute__((reqd_work_group_size(16, 1, 8)))
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
(
|
||||
__global INPUT0_TYPE* I,
|
||||
@@ -75,7 +75,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
#endif
|
||||
#if BIAS_TERM
|
||||
const __global UNIT_TYPE * bias,
|
||||
#endif
|
||||
#endif
|
||||
uint split_idx)
|
||||
{
|
||||
// (DxC2)x(UxWx8c)
|
||||
@@ -100,7 +100,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
|
||||
uint gx = get_group_id(0);
|
||||
uint gy = get_group_id(1);
|
||||
uint gz = get_group_id(2);
|
||||
uint gz = get_group_id(2);
|
||||
uint gk = gz % K16;
|
||||
uint gn = gz / K16;
|
||||
|
||||
@@ -266,7 +266,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
__local const UNIT_TYPE_8 *V_read_c16 = V_read;
|
||||
|
||||
__attribute__((opencl_unroll_hint(1)))
|
||||
for (uint c16 = 0; c16 < 2
|
||||
for (uint c16 = 0; c16 < 2
|
||||
#ifndef FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB
|
||||
&& coordU0.y < last_coord_y
|
||||
#endif
|
||||
@@ -297,17 +297,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
|
||||
// Fetch 8 channels of Winograd components from f(k,s)
|
||||
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
|
||||
const UNIT_TYPE_8 f00 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x, coordU0.y)));
|
||||
const UNIT_TYPE_8 f00 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x, coordU0.y)));
|
||||
#else
|
||||
const UNIT_TYPE_8 f00 = (UNIT_TYPE_8)(
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 0 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 1 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 2 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 3 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 4 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 5 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 6 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 7 * WEIGHTWIDTH])));
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 0 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 1 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 2 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 3 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 4 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 5 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 6 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 7 * WEIGHTWIDTH])));
|
||||
#endif
|
||||
|
||||
|
||||
@@ -467,17 +467,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
DOT8i_7(M6.s1, f00, V8, 10 + c8);
|
||||
|
||||
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
|
||||
const UNIT_TYPE_8 f01 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x + 16 * sizeof(UNIT_TYPE), coordU0.y)));
|
||||
const UNIT_TYPE_8 f01 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x + 16 * sizeof(UNIT_TYPE), coordU0.y)));
|
||||
#else
|
||||
const UNIT_TYPE_8 f01 = (UNIT_TYPE_8)(
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 0 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 1 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 2 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 3 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 4 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 5 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 6 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 7 * WEIGHTWIDTH])));
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 0 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 1 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 2 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 3 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 4 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 5 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 6 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 7 * WEIGHTWIDTH])));
|
||||
#endif
|
||||
|
||||
// f1[c8] x v[1 .. 15]
|
||||
@@ -637,17 +637,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
DOT8i_7(M6.s1, f01, V8, 12 + c8);
|
||||
|
||||
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
|
||||
const UNIT_TYPE_8 f02 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x + 32 * sizeof(UNIT_TYPE), coordU0.y)));
|
||||
const UNIT_TYPE_8 f02 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x + 32 * sizeof(UNIT_TYPE), coordU0.y)));
|
||||
#else
|
||||
const UNIT_TYPE_8 f02 = (UNIT_TYPE_8)(
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 0 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 1 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 2 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 3 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 4 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 5 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 6 * WEIGHTWIDTH])),
|
||||
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 7 * WEIGHTWIDTH])));
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 0 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 1 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 2 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 3 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 4 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 5 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 6 * WEIGHTWIDTH])),
|
||||
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 7 * WEIGHTWIDTH])));
|
||||
#endif
|
||||
coordU0.y += 8;
|
||||
|
||||
@@ -919,7 +919,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
#else
|
||||
O_write_0[0] = ACTIVATION(S0.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
#if OUTPUT_LAYOUT_BYXF
|
||||
@@ -927,14 +927,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[0 * QK + 1 * K] = ACTIVATION(S0.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[0 * QK + 1 * K] = ACTIVATION(S0.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_0[1] = ACTIVATION(S0.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_0[1] = ACTIVATION(S0.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -946,14 +946,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[1 * QK + 0 * K] = ACTIVATION(S1.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[1 * QK + 0 * K] = ACTIVATION(S1.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_1[0] = ACTIVATION(S1.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_1[0] = ACTIVATION(S1.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
#if OUTPUT_LAYOUT_BYXF
|
||||
@@ -961,14 +961,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[1 * QK + 1 * K] = ACTIVATION(S1.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[1 * QK + 1 * K] = ACTIVATION(S1.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_1[1] = ACTIVATION(S1.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_1[1] = ACTIVATION(S1.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -980,14 +980,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[2 * QK + 0 * K] = ACTIVATION(S2.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[2 * QK + 0 * K] = ACTIVATION(S2.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_2[0] = ACTIVATION(S2.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_2[0] = ACTIVATION(S2.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
#if OUTPUT_LAYOUT_BYXF
|
||||
@@ -995,14 +995,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[2 * QK + 1 * K] = ACTIVATION(S2.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[2 * QK + 1 * K] = ACTIVATION(S2.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_2[1] = ACTIVATION(S2.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_2[1] = ACTIVATION(S2.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1014,13 +1014,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[3 * QK + 0 * K] = ACTIVATION(S3.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[3 * QK + 0 * K] = ACTIVATION(S3.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_3[0] = ACTIVATION(S3.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_3[0] = ACTIVATION(S3.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
@@ -1029,14 +1029,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[3 * QK + 1 * K] = ACTIVATION(S3.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[3 * QK + 1 * K] = ACTIVATION(S3.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_3[1] = ACTIVATION(S3.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_3[1] = ACTIVATION(S3.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1049,13 +1049,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[4 * QK + 0 * K] = ACTIVATION(S4.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[4 * QK + 0 * K] = ACTIVATION(S4.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_4[0] = ACTIVATION(S4.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_4[0] = ACTIVATION(S4.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
@@ -1064,14 +1064,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[4 * QK + 1 * K] = ACTIVATION(S4.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[4 * QK + 1 * K] = ACTIVATION(S4.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_4[1] = ACTIVATION(S4.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_4[1] = ACTIVATION(S4.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1083,13 +1083,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[5 * QK + 0 * K] = ACTIVATION(S5.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[5 * QK + 0 * K] = ACTIVATION(S5.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_5[0] = ACTIVATION(S5.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_5[0] = ACTIVATION(S5.s0 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
if (q1_in) {
|
||||
@@ -1098,14 +1098,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
O_write[5 * QK + 1 * K] = ACTIVATION(S5.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write[5 * QK + 1 * K] = ACTIVATION(S5.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
#if BIAS_TERM
|
||||
O_write_5[1] = ACTIVATION(S5.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
|
||||
#else
|
||||
O_write_5[1] = ACTIVATION(S5.s1 * scl, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1113,4 +1113,4 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
|
||||
}
|
||||
#undef UNIT_TYPE_2
|
||||
#undef UNIT_TYPE_4
|
||||
#undef UNIT_TYPE_8
|
||||
#undef UNIT_TYPE_8
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(convolution_gpu_yxfb_ref)(
|
||||
|
||||
@@ -2,11 +2,13 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
__attribute__((reqd_work_group_size(16, 1, 1)))
|
||||
KERNEL(convolution_gpu_yxfb_yxio_b16)(
|
||||
const __global UNIT_TYPE* input,
|
||||
@@ -94,7 +96,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
|
||||
for (uint h = 0; h < FILTER_IFM_NUM; h++)
|
||||
{
|
||||
#if defined(USE_BLOCK_READ_2)
|
||||
half4 _input = as_half4(intel_sub_group_block_read2((const __global uint*)(input + input_idx)));
|
||||
half4 _input = as_half4(_sub_group_block_read2((const __global uint*)(input + input_idx)));
|
||||
uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
|
||||
half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
|
||||
_data[0] = fma(_input.s0, filter_transp, _data[0]);
|
||||
@@ -103,7 +105,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
|
||||
_data[3] = fma(_input.s3, filter_transp, _data[3]);
|
||||
input_idx += INPUT0_FEATURE_PITCH;
|
||||
#elif defined(USE_BLOCK_READ_1)
|
||||
half2 _input = as_half2(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
|
||||
half2 _input = as_half2(_sub_group_block_read((const __global uint*)(input + input_idx)));
|
||||
uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
|
||||
half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
|
||||
_data[0] = fma(_input.s0, filter_transp, _data[0]);
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
@@ -93,7 +95,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
|
||||
for (uint h = 0; h < FILTER_IFM_NUM; h++)
|
||||
{
|
||||
#ifdef USE_BLOCK_READ_2
|
||||
float2 _input = as_float2(intel_sub_group_block_read2((const __global uint*)input + input_idx));
|
||||
float2 _input = as_float2(_sub_group_block_read2((const __global uint*)input + input_idx));
|
||||
float8 filter_transp = TRANSPOSE_BLOCK_8(filter[filter_idx]);
|
||||
_data[0] = fma(_input.s0, filter_transp, _data[0]);
|
||||
_data[1] = fma(_input.s1, filter_transp, _data[1]);
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
@@ -18,23 +19,23 @@ KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
|
||||
{
|
||||
#if USE_VECTOR == 8
|
||||
#define VECTOR_FLOAT float8
|
||||
#define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
|
||||
#define BLOCK_READ(IN) as_float8(_sub_group_block_read8((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
|
||||
#endif
|
||||
#if USE_VECTOR == 4
|
||||
#define VECTOR_FLOAT float4
|
||||
#define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
|
||||
#define BLOCK_READ(IN) as_float4(_sub_group_block_read4((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
|
||||
#endif
|
||||
#if USE_VECTOR == 2
|
||||
#define VECTOR_FLOAT float2
|
||||
#define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
|
||||
#define BLOCK_READ(IN) as_float2(_sub_group_block_read2((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
|
||||
#endif
|
||||
#if USE_VECTOR == 1
|
||||
#define VECTOR_FLOAT float
|
||||
#define BLOCK_READ(IN) as_float(intel_sub_group_block_read((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write((__global uint*)OUT, as_uint(DATA));
|
||||
#define BLOCK_READ(IN) as_float(_sub_group_block_read((const __global uint*)IN))
|
||||
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write((__global uint*)OUT, as_uint(DATA));
|
||||
#endif
|
||||
|
||||
const uint batch_num = INPUT0_BATCH_NUM;
|
||||
@@ -99,7 +100,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
|
||||
float _in[X_PER_WORK_ITEM];
|
||||
for(uint a = 0; a < X_PER_WORK_ITEM; a++)
|
||||
{
|
||||
_in[a] = as_float(intel_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
|
||||
_in[a] = as_float(_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
|
||||
}
|
||||
float8 _input[X_PER_WORK_ITEM];
|
||||
for(uint a = 0; a < X_PER_WORK_ITEM; a++)
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
@@ -65,7 +67,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b8)(
|
||||
#endif
|
||||
for (uint h = 0; h < FILTER_IFM_NUM / 8; h++)
|
||||
{
|
||||
float8 _input = as_float8(intel_sub_group_block_read8((const __global uint*)input + input_idx));
|
||||
float8 _input = as_float8(_sub_group_block_read8((const __global uint*)input + input_idx));
|
||||
|
||||
DOT_PRODUCT_8(_data0, _input.s0, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
|
||||
#if OFM_PER_WORK_ITEM == 16
|
||||
@@ -128,8 +130,8 @@ KERNEL(convolution_gpu_yxfb_yxio_b8)(
|
||||
#endif
|
||||
|
||||
const uint _out_id = OUTPUT_OFFSET + out_id;
|
||||
intel_sub_group_block_write8((__global uint*)output + _out_id, as_uint8(_data0));
|
||||
_sub_group_block_write8((__global uint*)output + _out_id, as_uint8(_data0));
|
||||
#if OFM_PER_WORK_ITEM == 16
|
||||
intel_sub_group_block_write8((__global uint*)output + _out_id + 8 * INPUT0_FEATURE_PITCH, as_uint8(_data1));
|
||||
_sub_group_block_write8((__global uint*)output + _out_id + 8 * INPUT0_FEATURE_PITCH, as_uint8(_data1));
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(ctc_greedy_decoder_ref)(const __global INPUT0_TYPE* probabilities
|
||||
|
||||
@@ -2,36 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
///////////////////////// Input Index /////////////////////////
|
||||
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if INPUT0_DIMS < 5
|
||||
return INPUT0_GET_INDEX(b, f, y, x);
|
||||
#elif INPUT0_DIMS == 5
|
||||
return INPUT0_GET_INDEX(b, f, z, y, x);
|
||||
#elif INPUT0_DIMS == 6
|
||||
return INPUT0_GET_INDEX(b, f, w, z, y, x);
|
||||
#else
|
||||
#error cum_sum_ref.cl: input format - not supported
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////// Output Index /////////////////////////
|
||||
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if OUTPUT_DIMS < 5
|
||||
return OUTPUT_GET_INDEX(b, f, y, x);
|
||||
#elif OUTPUT_DIMS == 5
|
||||
return OUTPUT_GET_INDEX(b, f, z, y, x);
|
||||
#elif OUTPUT_DIMS == 6
|
||||
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
|
||||
#else
|
||||
#error cum_sum_ref.cl: output format - not supported
|
||||
#endif
|
||||
}
|
||||
#include "include/fetch_utils.cl"
|
||||
|
||||
inline void FUNC(get_indices)(int *axes)
|
||||
{
|
||||
@@ -87,8 +58,6 @@ inline void FUNC(get_indices)(int *axes)
|
||||
#endif
|
||||
}
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
|
||||
#if CUM_SUM_PARTIAL_SUM
|
||||
inline uint FUNC(get_current_index)(int axis, int i)
|
||||
{
|
||||
@@ -99,7 +68,7 @@ inline uint FUNC(get_current_index)(int axis, int i)
|
||||
#endif
|
||||
}
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
__attribute__((reqd_work_group_size(LWS, 1, 1)))
|
||||
KERNEL(cum_sum_partial_sum)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
@@ -160,7 +129,7 @@ inline uint FUNC(get_current_index)(int i)
|
||||
}
|
||||
|
||||
// main
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
__attribute__((reqd_work_group_size(LWS, 1, 1)))
|
||||
KERNEL(cum_sum_final)(
|
||||
const __global PARTIAL_TYPE* partial,
|
||||
|
||||
@@ -2,36 +2,7 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
///////////////////////// Input Index /////////////////////////
|
||||
inline uint FUNC(get_input_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if INPUT0_DIMS < 5
|
||||
return INPUT0_GET_INDEX(b, f, y, x);
|
||||
#elif INPUT0_DIMS == 5
|
||||
return INPUT0_GET_INDEX(b, f, z, y, x);
|
||||
#elif INPUT0_DIMS == 6
|
||||
return INPUT0_GET_INDEX(b, f, w, z, y, x);
|
||||
#else
|
||||
#error cum_sum_ref.cl: input format - not supported
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////// Output Index /////////////////////////
|
||||
inline uint FUNC(get_output_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x)
|
||||
{
|
||||
#if OUTPUT_DIMS < 5
|
||||
return OUTPUT_GET_INDEX(b, f, y, x);
|
||||
#elif OUTPUT_DIMS == 5
|
||||
return OUTPUT_GET_INDEX(b, f, z, y, x);
|
||||
#elif OUTPUT_DIMS == 6
|
||||
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
|
||||
#else
|
||||
#error cum_sum_ref.cl: output format - not supported
|
||||
#endif
|
||||
}
|
||||
#include "include/fetch_utils.cl"
|
||||
|
||||
KERNEL(cum_sum_ref)(
|
||||
OPTIONAL_SHAPE_INFO_ARG
|
||||
|
||||
@@ -2,13 +2,11 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
#include "deconvolution_gpu_imad_common.cl"
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
#if X_BLOCK_SIZE == 1
|
||||
@@ -54,7 +52,7 @@ DECLARE_READ_BLOCK_8(preload_weights, FILTER_TYPE)
|
||||
# endif
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE))) // attr:no-format
|
||||
REQD_SUB_GROUP_SIZE(FEATURE_SLICE_SIZE) // attr:no-format
|
||||
__attribute__((reqd_work_group_size(1, FEATURE_SLICE_SIZE, 1)))
|
||||
KERNEL(deconvolution_gpu_b_fs_zyx_fsv16_dw)(
|
||||
const __global INPUT0_TYPE *input,
|
||||
@@ -272,7 +270,6 @@ KERNEL(deconvolution_gpu_b_fs_zyx_fsv16_dw)(
|
||||
}
|
||||
}
|
||||
|
||||
#undef unroll_for
|
||||
#undef FEATURE_SLICE_SIZE
|
||||
|
||||
#undef GET_VEC_ELEM
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define WORK_GROUP_GROUP_SIZE 16
|
||||
|
||||
@@ -4,8 +4,10 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
#include "deconvolution_gpu_imad_common.cl"
|
||||
|
||||
@@ -31,7 +33,7 @@ DECLARE_STORE_BLOCK_4(store_output, OUTPUT_TYPE)
|
||||
#define WEIGHTS_IN_TILE_OFM_PITCH (TILE_IFM * SIMD)
|
||||
|
||||
__attribute__((reqd_work_group_size(1, SIMD, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
KERNEL(deconvolution_gpu_imad_ref)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* restrict output,
|
||||
@@ -127,8 +129,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
|
||||
for (uint fi = 0; fi < FILTER_IFM_NUM; fi += TILE_IFM) {
|
||||
// Load weights [TILE_OFM, TILE_IFM, 1, 1]
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
uint weights_idx = weights_offset + of * WEIGHTS_IN_TILE_OFM_PITCH / 4;
|
||||
FUNC_CALL(load_weights_ui)(weights_ui, weights_idx, TILE_IFM / 4, wei[of]);
|
||||
}
|
||||
@@ -142,8 +143,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
uint input_offset = INPUT0_GET_INDEX(out_b, if_start + fi, fixed_in_z, fixed_in_y, fixed_in_x) / 4;
|
||||
# endif
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
uint input_idx = input_offset + ob * INPUT_IN_TILE_B_PITCH / 4;
|
||||
FUNC_CALL(load_input_ui)(input_ui, input_idx, TILE_IFM / 4, in[ob]);
|
||||
}
|
||||
@@ -151,24 +151,18 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
input_offset += INPUT_TILE_IFM_PITCH / 4;
|
||||
#endif
|
||||
if (zero_x) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ifp = 0; ifp < TILE_IFM / 4; ++ifp) {
|
||||
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for(uint ifp = 0; ifp < TILE_IFM / 4; ++ifp) {
|
||||
in[ob][ifp] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint imad_it = 0; imad_it < TILE_IFM / 4; ++imad_it) {
|
||||
uint in_val = intel_sub_group_shuffle(in[ob][imad_it], tx);
|
||||
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for (uint imad_it = 0; imad_it < TILE_IFM / 4; ++imad_it) {
|
||||
uint in_val = _sub_group_shuffle(in[ob][imad_it], tx);
|
||||
acc[ob][of][tx] = IMAD(acc[ob][of][tx], AS_INPUT_TYPE4(in_val), AS_FILTER_TYPE4(wei[of][imad_it]));
|
||||
}
|
||||
}
|
||||
@@ -180,25 +174,19 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
}
|
||||
|
||||
ACTIVATION_TYPE dequantized[TILE_B][TILE_OFM][TILE_X];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
dequantized[ob][of][tx] = TO_ACTIVATION_TYPE(acc[ob][of][tx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if BIAS_TERM
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
BIAS_TYPE bias_val = bias[out_f + of * SIMD];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
dequantized[ob][of][tx] += TO_ACTIVATION_TYPE(bias_val);
|
||||
}
|
||||
}
|
||||
@@ -206,15 +194,12 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
#endif
|
||||
|
||||
OUTPUT_TYPE result[TILE_B][TILE_OFM][TILE_X];
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
#if FUSED_OPS_CAN_USE_PRELOAD
|
||||
FUSED_OPS_PRELOAD;
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for(uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
#if HAS_FUSED_OPS
|
||||
# if FUSED_OPS_CAN_USE_PRELOAD
|
||||
FUSED_OPS_CALC;
|
||||
@@ -233,12 +218,9 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
bool leftovers_f = OUTPUT_FEATURE_NUM % SIMD != 0 && out_f + SIMD >= OUTPUT_FEATURE_NUM;
|
||||
|
||||
#if OUTPUT_NAIVE_STORE
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
if ((leftovers_x && tx >= OUTPUT_SIZE_X % TILE_X) ||
|
||||
(leftovers_f && out_f + of * SIMD >= OUTPUT_FEATURE_NUM))
|
||||
break;
|
||||
@@ -252,10 +234,8 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
}
|
||||
}
|
||||
#elif OUTPUT_BLOCK_X_STORE
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint of = 0; of < TILE_OFM; ++of) {
|
||||
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
|
||||
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
|
||||
#if OUTPUT_DIMS <= 4
|
||||
uint output_idx = OUTPUT_GET_INDEX(out_b + ob, out_fg + of * SIMD, out_y, out_x);
|
||||
#elif OUTPUT_DIMS == 5
|
||||
@@ -266,8 +246,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
|
||||
} else if (!leftovers_f) {
|
||||
FUNC_CALL(store_output)(output, output_idx, OUTPUT_SIZE_X % TILE_X, result[ob][of]);
|
||||
} else {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
|
||||
if (out_f + of * SIMD < OUTPUT_FEATURE_NUM && out_x + tx < OUTPUT_SIZE_X) {
|
||||
output[output_idx + sglid + tx * SIMD] = result[ob][of][tx];
|
||||
}
|
||||
|
||||
@@ -2,10 +2,8 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
#define CEIL_DIV(a, b) (((a) + ((b) - 1)) / (b))
|
||||
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
|
||||
#define VEC_TO_ARR_1(var, arr, idx) \
|
||||
arr[idx] = var
|
||||
|
||||
@@ -4,8 +4,7 @@
|
||||
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/imad.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/imad.cl"
|
||||
|
||||
#include "deconvolution_gpu_imad_common.cl"
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(deconvolution_gpu_yxfb_ref)(
|
||||
|
||||
@@ -2,15 +2,16 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/unit_type.cl"
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
|
||||
#define GET_WEI(filter, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, filter), id))
|
||||
#define GET_WEI(filter, id) AS_TYPE(UNIT_TYPE, _sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, filter), id))
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(deformable_convolution_gpu_bfyx_conv)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
|
||||
@@ -2,10 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL(deformable_convolution_gpu_bfyx_interp)(
|
||||
const __global INPUT0_TYPE* data,
|
||||
const __global INPUT1_TYPE* trans,
|
||||
@@ -29,7 +28,7 @@ KERNEL(deformable_convolution_gpu_bfyx_interp)(
|
||||
|
||||
const int input_offset_x = input_x + kw * DILATION_SIZE_X;
|
||||
const int input_offset_y = input_y + kh * DILATION_SIZE_Y;
|
||||
|
||||
|
||||
#if DEFORMABLE_MASK_ENABLED
|
||||
const int dg_size = dg * FILTER_SIZE_Y * FILTER_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_X;
|
||||
const int trans_offset = b * INPUT1_BATCH_PITCH + 2 * dg_size;
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(depth_to_space_block2_opt)(const __global half* input, __global half* output)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(depth_to_space_ref)(const __global INPUT0_TYPE* input,
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/detection_output_common.cl"
|
||||
|
||||
@@ -56,7 +55,6 @@
|
||||
// LOCAL_BATCHES_NUM - number of batch that can be process per work-group
|
||||
// =================================================================================================================
|
||||
|
||||
#define unroll_for __attribute__((opencl_unroll_hint)) for
|
||||
#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
|
||||
|
||||
typedef struct __attribute__((__packed__)) {
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define FEATURE_SLICE_SIZE 16
|
||||
#define unroll_for __attribute__((opencl_unroll_hint())) for
|
||||
|
||||
#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE)
|
||||
#define TO_TYPE(type, val) CAT(convert_, type)(val)
|
||||
@@ -25,7 +25,7 @@
|
||||
#define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX)(idx_order)
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(FEATURE_SLICE_SIZE)
|
||||
KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
|
||||
__global OUTPUT_TYPE* output
|
||||
#if HAS_FUSED_OPS_DECLS
|
||||
@@ -107,7 +107,6 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
|
||||
}
|
||||
|
||||
#undef FEATURE_SLICE_SIZE
|
||||
#undef unroll_for
|
||||
#undef OUTPUT_TYPE_BLOCK
|
||||
#undef TO_TYPE
|
||||
#undef READ_FUNC
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(eltwise_fs_b_yx_fsv32)(
|
||||
|
||||
@@ -2,18 +2,19 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/unit_type.cl"
|
||||
|
||||
// Kernel works only for sub_group size of 16 with 32 features slice size and process 2 features per WI
|
||||
#define REQD_SUB_GROUP_SIZE 16
|
||||
#define SUB_GROUP_SIZE 16
|
||||
#define REQD_FEATURE_SLICE_SIZE 32
|
||||
#define REQD_FEATURES_PER_WORK_ITEM 2
|
||||
|
||||
//inputs_decls -> __global unit_type * input0, __global unit_type * input1
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(REQD_SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
KERNEL(eltwise_mixed_byxf_and_fs_b_yx_fsv32)(
|
||||
INPUTS_DECLS
|
||||
__global UNIT_TYPE* output)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(eltwise_gpu_vload8)(INPUTS_DECLS
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#ifdef PACKED_SUM
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
#define INPUT_TYPE INPUT0_TYPE
|
||||
#define INPUT_TYPE2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
inline int FUNC(get_pyramid_level_index)(uint level, uint c, uint y, uint x) {
|
||||
uint idx = 0;
|
||||
@@ -64,9 +63,10 @@ KERNEL(experimental_detectron_roi_feature_extractor_ref)(const __global INPUT0_T
|
||||
const uint roi_bin_grid_w = (SAMPLING_RATIO > 0) ? SAMPLING_RATIO : (uint)ceil(roi_width / POOLED_WIDTH);
|
||||
const uint roi_bin_grid_h = (SAMPLING_RATIO > 0) ? SAMPLING_RATIO : (uint)ceil(roi_height / POOLED_HEIGHT);
|
||||
|
||||
const uint level_h = LEVEL_SIZES[3 * level];
|
||||
const uint level_w = LEVEL_SIZES[3 * level + 1];
|
||||
const uint level_offset = LEVEL_SIZES[3 * level + 2];
|
||||
size_t level_sizes_arr[3*NUM_PYRAMID_LEVELS] = LEVEL_SIZES;
|
||||
const uint level_h = level_sizes_arr[3 * level];
|
||||
const uint level_w = level_sizes_arr[3 * level + 1];
|
||||
const uint level_offset = level_sizes_arr[3 * level + 2];
|
||||
|
||||
INPUT0_TYPE output_val = 0.0;
|
||||
INPUT0_TYPE current_bin_start_h = roi_start_h + y * bin_height;
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
|
||||
KERNEL(experimental_detectron_topk_rois_ref)(const __global INPUT0_TYPE* input_rois,
|
||||
const __global INPUT1_TYPE* topk_indices, __global OUTPUT_TYPE* output_rois)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
KERNEL(extract_image_patches_ref)(const __global INPUT0_TYPE* input,
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
#include "include/mmad.cl"
|
||||
@@ -12,12 +11,12 @@
|
||||
#define INPUT_PACKED_TYPE_VEC CAT(INPUT_PACKED_TYPE, SUB_GROUP_SIZE)
|
||||
#define FILTER_PACKED_TYPE_VEC CAT(FILTER_PACKED_TYPE, SUB_GROUP_SIZE)
|
||||
|
||||
#define BLOCK_READ(ptr) intel_sub_group_block_read((const __global uint*)(ptr))
|
||||
#define BLOCK_READ_8(ptr) intel_sub_group_block_read8((const __global uint*)(ptr))
|
||||
#define BLOCK_READ(ptr) _sub_group_block_read((const __global uint*)(ptr))
|
||||
#define BLOCK_READ_8(ptr) _sub_group_block_read8((const __global uint*)(ptr))
|
||||
|
||||
#define MMAD CAT(MMAD_, SUB_GROUP_SIZE)
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
KERNEL(fully_connected_gpu_MMAD)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@@ -133,8 +132,7 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
INPUT_PACKED_TYPE input_data[UNROLL_FACTOR];
|
||||
FILTER_PACKED_TYPE_VEC weights_data[UNROLL_FACTOR];
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||
unroll_for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||
input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx + kb * MMAD_INPUT_FBLOCK_PITCH));
|
||||
#if SUB_GROUP_SIZE == 8
|
||||
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
|
||||
@@ -144,8 +142,7 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
#endif // SUB_GROUP_SIZE
|
||||
}
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||
unroll_for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||
INPUT_PACKED_TYPE_VEC in;
|
||||
|
||||
in.s0 = sub_group_broadcast(input_data[kb], 0);
|
||||
@@ -177,8 +174,7 @@ KERNEL(fully_connected_gpu_MMAD)(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (feature_block == 0) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
unroll_for(uint i = 1; i < SLM_DIV_FACTOR; i++)
|
||||
dotProd += partial_summ[lid0 % feature_per_wg + i * feature_per_wg];
|
||||
#endif // SLM_DIV_FACTOR > 1
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
#if defined(__fc_f16)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
// Required JIT constants:
|
||||
@@ -18,7 +17,7 @@
|
||||
|
||||
#define ACC_TYPE float
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(16)))
|
||||
REQD_SUB_GROUP_SIZE(16)
|
||||
KERNEL (fully_connected_gpu_bf_io_input_spatial)(
|
||||
const __global UNIT_TYPE* input,
|
||||
__global UNIT_TYPE* output,
|
||||
@@ -47,8 +46,8 @@ KERNEL (fully_connected_gpu_bf_io_input_spatial)(
|
||||
uint it_w_addr = _inG == UNIT_VAL_ZERO ? weight_idx_base : s_w_idx;
|
||||
for (uint j = 0; j < 16; j++)
|
||||
{
|
||||
UNIT_TYPE _in = intel_sub_group_shuffle(_inG, j);
|
||||
uint wi_w_addr = intel_sub_group_shuffle(it_w_addr, j);
|
||||
UNIT_TYPE _in = _sub_group_shuffle(_inG, j);
|
||||
uint wi_w_addr = _sub_group_shuffle(it_w_addr, j);
|
||||
wi_w_addr += MULTIPLY_OFFSET(UNIT_TYPE, get_sub_group_local_id());
|
||||
UNIT_TYPE _w = *OFFSET_GLOBAL_PTR(UNIT_TYPE, weight, wi_w_addr);
|
||||
result += _in * _w;
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
// Required JIT constants:
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
//
|
||||
|
||||
#include "include/batch_headers/common.cl"
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
|
||||
// JIT Parameters:
|
||||
// SIMD - sub-group size/simd width, one of {8, 16};
|
||||
@@ -51,11 +53,6 @@
|
||||
#define BIAS_BLOCK_READ(ptr, offset) BLOCK_READN(BIAS_TYPE, TILE_OFM, ptr, offset)
|
||||
#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, TILE_OFM, ptr, offset, val)
|
||||
|
||||
// Utility math macros.
|
||||
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// Check alignment restrictions for using block writes on output.
|
||||
#define USE_BLOCK_WRITE ((OUTPUT_TYPE_SIZE * TILE_OUT_B_PITCH) % 16 == 0 && (OUTPUT_TYPE_SIZE * OUTPUT_OFFSET) % 16 == 0)
|
||||
|
||||
@@ -80,7 +77,7 @@
|
||||
# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
|
||||
#endif
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD)))
|
||||
REQD_SUB_GROUP_SIZE(SIMD)
|
||||
KERNEL(fc)(
|
||||
const __global INPUT0_TYPE* input,
|
||||
__global OUTPUT_TYPE* output,
|
||||
@@ -122,9 +119,8 @@ KERNEL(fc)(
|
||||
INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % TILE_B * TILE_IN_B_PITCH];
|
||||
MAKE_VECTOR_TYPE(FILTER_TYPE, TILE_OFM) tmp_wei = BLOCK_READN(FILTER_TYPE, TILE_OFM, weights, weights_offset);
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
acc[bi] = intel_sub_group_shuffle(tmp_input, bi) * tmp_wei;
|
||||
unroll_for(uint bi = 0; bi < TILE_B; ++bi) {
|
||||
acc[bi] = _sub_group_shuffle(tmp_input, bi) * tmp_wei;
|
||||
}
|
||||
|
||||
weights_offset += TILE_OFM * SIMD;
|
||||
@@ -148,19 +144,15 @@ KERNEL(fc)(
|
||||
// NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
|
||||
// but significantly degrades readability and generality of code.
|
||||
// It also doesn't show a noticeable performance improvement on tested configurations.
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
|
||||
unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
|
||||
wei = FILTER_BLOCK_READ(weights, weights_offset);
|
||||
weights_offset += TILE_K_OFM * SIMD;
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint kii = 0; kii < TILE_K; ++kii) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
unroll_for (uint kii = 0; kii < TILE_K; ++kii) {
|
||||
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
const uint total_k = ki * TILE_K + kii;
|
||||
INPUT0_TYPE in_val = intel_sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
|
||||
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
|
||||
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((FILTER_TYPE*)(&wei))[kii * TILE_OFM + fi];
|
||||
}
|
||||
}
|
||||
@@ -181,20 +173,16 @@ KERNEL(fc)(
|
||||
CONST_LOOP(TILE_B, LOAD_IN_0);
|
||||
#undef LOAD_IN_0
|
||||
input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
|
||||
unroll_for(uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
|
||||
wei = FILTER_BLOCK_READ(weights, weights_offset);
|
||||
weights_offset += TILE_K_OFM * SIMD;
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint kii = 0; kii < TILE_K; ++kii) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
unroll_for (uint kii = 0; kii < TILE_K; ++kii) {
|
||||
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
const uint total_k = ki * TILE_K + kii;
|
||||
if (total_k < LEFTOVER_IFM) {
|
||||
INPUT0_TYPE in_val = intel_sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
|
||||
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
|
||||
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((FILTER_TYPE*)(&wei))[kii * TILE_OFM + fi];
|
||||
}
|
||||
}
|
||||
@@ -216,24 +204,20 @@ KERNEL(fc)(
|
||||
BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f);
|
||||
#else
|
||||
BIAS_VEC_TYPE bias = 0;
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
((BIAS_TYPE*)(&bias))[fi] = biases[out_f + sglid + fi * SIMD];
|
||||
}
|
||||
#endif
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
activated[bi] += TO_ACTIVATION_VEC_TYPE(bias);
|
||||
}
|
||||
#endif
|
||||
|
||||
OUTPUT_VEC_TYPE result[TILE_B] = { };
|
||||
#if HAS_FUSED_OPS
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
#if TILE_OFM > 1
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
|
||||
FUSED_OPS_VEC;
|
||||
result[bi][fi] = FUSED_OPS_RESULT_VEC;
|
||||
}
|
||||
@@ -243,8 +227,7 @@ KERNEL(fc)(
|
||||
#endif // TILE_OFM > 1
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
|
||||
result[bi] = TO_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(activated[bi], ACTIVATION_PARAMS_TYPED));
|
||||
}
|
||||
#endif
|
||||
@@ -314,10 +297,6 @@ KERNEL(fc)(
|
||||
#undef BIAS_BLOCK_READ
|
||||
#undef OUTPUT_BLOCK_WRITE
|
||||
|
||||
#undef CEIL_DIV
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
|
||||
#undef USE_BLOCK_WRITE
|
||||
|
||||
#undef MAIN_LOOP_ELEMENTS_COUNT
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/batch_headers/fetch_weights.cl"
|
||||
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
// Block read - currently block is 4 bytes aligned.
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
|
||||
|
||||
#define MULTIPLY_BLOCKS_16x8(_result, _blockA, _blockB) \
|
||||
{ \
|
||||
@@ -32,8 +33,8 @@
|
||||
#define SUB_GROUP_SIZE 16
|
||||
|
||||
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv16_vload)(
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(
|
||||
const __global UNIT_TYPE* input,
|
||||
__global UNIT_TYPE* output,
|
||||
const __global UNIT_TYPE* weight
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/sub_group_block_read.cl"
|
||||
#include "include/batch_headers/sub_group_block_write.cl"
|
||||
#include "include/batch_headers/sub_group_shuffle.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------------------
|
||||
@@ -93,7 +95,7 @@
|
||||
|
||||
// Extracts one scalar element of UNIT_TYPE from sub-group chunk;
|
||||
// chunk - name of chunk variable, idx - 0-based index of element.
|
||||
#define SG_UNIT_SELECT(chunk, idx) CHUNK_UNIT_SELECT(intel_sub_group_shuffle(chunk, (idx) / UNITS_PER_CHUNK), (idx) % UNITS_PER_CHUNK)
|
||||
#define SG_UNIT_SELECT(chunk, idx) CHUNK_UNIT_SELECT(_sub_group_shuffle(chunk, (idx) / UNITS_PER_CHUNK), (idx) % UNITS_PER_CHUNK)
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------------------
|
||||
// Reads / Writes:
|
||||
@@ -118,10 +120,10 @@
|
||||
(array)[(idx) + 6] = chunk_vec.s6, (array)[(idx) + 7] = chunk_vec.s7))
|
||||
|
||||
// Currently block read is 4 bytes aligned.
|
||||
#define ALIGNED_BLOCK_READ1(ptr, byte_offset) intel_sub_group_block_read((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) intel_sub_group_block_read2((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ4(ptr, byte_offset) intel_sub_group_block_read4((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) intel_sub_group_block_read8((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ1(ptr, byte_offset) _sub_group_block_read((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) _sub_group_block_read2((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ4(ptr, byte_offset) _sub_group_block_read4((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) _sub_group_block_read8((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
|
||||
// Currently read is 4 bytes aligned.
|
||||
#define ALIGNED_READ1(ptr, byte_offset) (*(const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
@@ -130,10 +132,10 @@
|
||||
#define ALIGNED_READ8(ptr, byte_offset) vload8(0, (const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
|
||||
|
||||
// Currently block write is 16 bytes aligned.
|
||||
#define ALIGNED_BLOCK_WRITE1(ptr, byte_offset, val) intel_sub_group_block_write((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
#define ALIGNED_BLOCK_WRITE2(ptr, byte_offset, val) intel_sub_group_block_write2((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
#define ALIGNED_BLOCK_WRITE4(ptr, byte_offset, val) intel_sub_group_block_write4((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
#define ALIGNED_BLOCK_WRITE1(ptr, byte_offset, val) _sub_group_block_write((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
#define ALIGNED_BLOCK_WRITE2(ptr, byte_offset, val) _sub_group_block_write2((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
#define ALIGNED_BLOCK_WRITE4(ptr, byte_offset, val) _sub_group_block_write4((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) _sub_group_block_write8((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
|
||||
|
||||
// Currently write is 4 bytes aligned.
|
||||
#define ALIGNED_WRITE1(ptr, byte_offset, val) ((void)(*(__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)) = (val)))
|
||||
@@ -156,7 +158,7 @@
|
||||
|
||||
|
||||
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
|
||||
KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
const __global UNIT_TYPE* input,
|
||||
@@ -210,32 +212,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
CHUNK_TYPE input_val[IN_CHUNK_PREFETCH_SIZE];
|
||||
|
||||
#if IN_CHUNK_PREFETCH_SIZE % 8 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 8)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 8)
|
||||
{
|
||||
CHUNK_VEC8_TYPE input_vals = ALIGNED_BLOCK_READ8(input, input_offset + 8 * sg_elem_offset);
|
||||
input_offset += 8 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
|
||||
}
|
||||
#elif IN_CHUNK_PREFETCH_SIZE % 4 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 4)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 4)
|
||||
{
|
||||
CHUNK_VEC4_TYPE input_vals = ALIGNED_BLOCK_READ4(input, input_offset + 4 * sg_elem_offset);
|
||||
input_offset += 4 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
|
||||
}
|
||||
#elif IN_CHUNK_PREFETCH_SIZE % 2 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 2)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 2)
|
||||
{
|
||||
CHUNK_VEC2_TYPE input_vals = ALIGNED_BLOCK_READ2(input, input_offset + 2 * sg_elem_offset);
|
||||
input_offset += 2 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 1)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 1)
|
||||
{
|
||||
CHUNK_VEC1_TYPE input_vals = ALIGNED_BLOCK_READ1(input, input_offset + sg_elem_offset);
|
||||
input_offset += BYTES_PER_SG_READ;
|
||||
@@ -243,8 +241,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
}
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint elem_base_idx = 0; elem_base_idx < IN_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
|
||||
unroll_for(uint elem_base_idx = 0; elem_base_idx < IN_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
|
||||
{
|
||||
// Contains group of weights for RESPONSES_PER_SG_EXEC responses and for (FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC) spatial points.
|
||||
// Currently for floats:
|
||||
@@ -264,32 +261,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
CHUNK_TYPE filter_val[FILTER_CHUNK_PREFETCH_SIZE];
|
||||
|
||||
#if FILTER_CHUNK_PREFETCH_SIZE % 8 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 8)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 8)
|
||||
{
|
||||
CHUNK_VEC8_TYPE filter_vals = ALIGNED_BLOCK_READ8(weight, filter_offset + 8 * sg_elem_offset);
|
||||
filter_offset += 8 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
|
||||
}
|
||||
#elif FILTER_CHUNK_PREFETCH_SIZE % 4 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 4)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 4)
|
||||
{
|
||||
CHUNK_VEC4_TYPE filter_vals = ALIGNED_BLOCK_READ4(weight, filter_offset + 4 * sg_elem_offset);
|
||||
filter_offset += 4 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
|
||||
}
|
||||
#elif FILTER_CHUNK_PREFETCH_SIZE % 2 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 2)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 2)
|
||||
{
|
||||
CHUNK_VEC2_TYPE filter_vals = ALIGNED_BLOCK_READ2(weight, filter_offset + 2 * sg_elem_offset);
|
||||
filter_offset += 2 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 1)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 1)
|
||||
{
|
||||
CHUNK_VEC1_TYPE filter_vals = ALIGNED_BLOCK_READ1(weight, filter_offset + sg_elem_offset);
|
||||
filter_offset += BYTES_PER_SG_READ;
|
||||
@@ -298,8 +291,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
#endif
|
||||
|
||||
// Processing of cached filter chunks.
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; ++filter_val_idx)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; ++filter_val_idx)
|
||||
{
|
||||
const uint input_base_elem_idx = elem_base_idx + filter_val_idx * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC;
|
||||
|
||||
@@ -338,32 +330,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
CHUNK_TYPE input_val[IN_CHUNK_PREFETCH_SIZE];
|
||||
|
||||
#if IN_CHUNK_PREFETCH_SIZE % 8 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 8 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 16)
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 8)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 8)
|
||||
{
|
||||
CHUNK_VEC8_TYPE input_vals = ALIGNED_BLOCK_READ8(input, input_offset + 8 * sg_elem_offset);
|
||||
input_offset += 8 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
|
||||
}
|
||||
#elif IN_CHUNK_PREFETCH_SIZE % 4 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 4 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 8)
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 4)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 4)
|
||||
{
|
||||
CHUNK_VEC4_TYPE input_vals = ALIGNED_BLOCK_READ4(input, input_offset + 4 * sg_elem_offset);
|
||||
input_offset += 4 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
|
||||
}
|
||||
#elif IN_CHUNK_PREFETCH_SIZE % 2 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 2 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 4)
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 2)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 2)
|
||||
{
|
||||
CHUNK_VEC2_TYPE input_vals = ALIGNED_BLOCK_READ2(input, input_offset + 2 * sg_elem_offset);
|
||||
input_offset += 2 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 1)
|
||||
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 1)
|
||||
{
|
||||
CHUNK_VEC1_TYPE input_vals = ALIGNED_BLOCK_READ1(input, input_offset + sg_elem_offset);
|
||||
input_offset += BYTES_PER_SG_READ;
|
||||
@@ -371,8 +359,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
}
|
||||
#endif
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint elem_base_idx = 0; elem_base_idx < INPUT0_ELEMENTS_REMAINDER; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
|
||||
unroll_for(uint elem_base_idx = 0; elem_base_idx < INPUT0_ELEMENTS_REMAINDER; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
|
||||
{
|
||||
// Size of array of CHUNK_TYPE needed to contain filter elements for input elements in range [elem_base_idx; INPUT0_ELEMENTS_REMAINDER).
|
||||
const uint filter_chunk_remainder_size = ((INPUT0_ELEMENTS_REMAINDER - elem_base_idx) * RESPONSES_PER_SG_EXEC + UNITS_PER_SG_READ - 1) / UNITS_PER_SG_READ;
|
||||
@@ -381,32 +368,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
CHUNK_TYPE filter_val[FILTER_CHUNK_PREFETCH_SIZE];
|
||||
|
||||
#if FILTER_CHUNK_PREFETCH_SIZE % 8 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 8)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 8)
|
||||
{
|
||||
CHUNK_VEC8_TYPE filter_vals = ALIGNED_BLOCK_READ8(weight, filter_offset + 8 * sg_elem_offset);
|
||||
filter_offset += 8 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
|
||||
}
|
||||
#elif FILTER_CHUNK_PREFETCH_SIZE % 4 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 4)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 4)
|
||||
{
|
||||
CHUNK_VEC4_TYPE filter_vals = ALIGNED_BLOCK_READ4(weight, filter_offset + 4 * sg_elem_offset);
|
||||
filter_offset += 4 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
|
||||
}
|
||||
#elif FILTER_CHUNK_PREFETCH_SIZE % 2 == 0
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 2)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 2)
|
||||
{
|
||||
CHUNK_VEC2_TYPE filter_vals = ALIGNED_BLOCK_READ2(weight, filter_offset + 2 * sg_elem_offset);
|
||||
filter_offset += 2 * BYTES_PER_SG_READ;
|
||||
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
|
||||
}
|
||||
#else
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 1)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 1)
|
||||
{
|
||||
CHUNK_VEC1_TYPE filter_vals = ALIGNED_BLOCK_READ1(weight, filter_offset + sg_elem_offset);
|
||||
filter_offset += BYTES_PER_SG_READ;
|
||||
@@ -415,8 +398,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
#endif
|
||||
|
||||
// Processing of cached filter chunks.
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; ++filter_val_idx)
|
||||
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; ++filter_val_idx)
|
||||
{
|
||||
const uint input_base_elem_idx = elem_base_idx + filter_val_idx * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC;
|
||||
|
||||
@@ -458,15 +440,14 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
|
||||
sg_reduce_offset < SUB_GROUP_SIZE;
|
||||
sg_reduce_offset += SUB_GROUP_SIZE * RESPONSES_PER_SG_EXEC / UNITS_PER_SG_READ)
|
||||
{
|
||||
reduced_acc = AS_CHUNK(AS_UNITS(reduced_acc) + AS_UNITS(intel_sub_group_shuffle_down(acc, zero, sg_reduce_offset)));
|
||||
reduced_acc = AS_CHUNK(AS_UNITS(reduced_acc) + AS_UNITS(_sub_group_shuffle_down(acc, zero, sg_reduce_offset)));
|
||||
}
|
||||
|
||||
|
||||
// Expand accumulator chunks to units.
|
||||
const uint expanded_acc_size = (RESPONSES_PER_SG_EXEC + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE;
|
||||
|
||||
__attribute__((opencl_unroll_hint))
|
||||
for (uint expanded_acc_idx = 0; expanded_acc_idx < expanded_acc_size; ++expanded_acc_idx)
|
||||
unroll_for (uint expanded_acc_idx = 0; expanded_acc_idx < expanded_acc_size; ++expanded_acc_idx)
|
||||
{
|
||||
const uint output_id = output_base_id + expanded_acc_idx * SUB_GROUP_SIZE;
|
||||
#if BIAS_TERM
|
||||
|
||||
@@ -2,13 +2,12 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
#if FP16_UNIT_USED
|
||||
// Block read - currently block is 4 bytes aligned.
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
|
||||
|
||||
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
|
||||
{ \
|
||||
@@ -31,7 +30,7 @@
|
||||
}
|
||||
#else
|
||||
// Block read - currently block is 4 bytes aligned.
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
|
||||
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
|
||||
|
||||
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
|
||||
{ \
|
||||
@@ -57,7 +56,7 @@
|
||||
#define SUB_GROUP_SIZE 8
|
||||
|
||||
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
|
||||
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
|
||||
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
|
||||
KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv8_vload)(
|
||||
const __global UNIT_TYPE* input,
|
||||
__global UNIT_TYPE* output,
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "include/batch_headers/data_types.cl"
|
||||
#include "include/batch_headers/fetch_data.cl"
|
||||
#include "include/sub_group.cl"
|
||||
|
||||
__attribute__((reqd_work_group_size(8, 1, 1)))
|
||||
KERNEL (fully_connected_gpu_xb_xb_b8_x8)(
|
||||
const __global float* input,
|
||||
__global float* output,
|
||||
const __global float* weight
|
||||
#if BIAS_TERM
|
||||
, __global UNIT_TYPE* bias)
|
||||
#else
|
||||
)
|
||||
#endif
|
||||
{
|
||||
const uint global_id = get_global_id(0);
|
||||
const int x = get_global_id(0);
|
||||
const uint batch_id = x % INPUT0_BATCH_NUM;
|
||||
|
||||
uint neuronIdx = (x / INPUT0_BATCH_NUM) * NEURONS_PER_WORK_ITEM;
|
||||
|
||||
const uint sub_group_id = get_local_id(0);
|
||||
const uint batch_num = INPUT0_BATCH_NUM;
|
||||
|
||||
const int out_id = (global_id / batch_num) * NEURONS_PER_WORK_ITEM * batch_num + batch_id;
|
||||
|
||||
const int ofm_offset = (global_id * NEURONS_PER_WORK_ITEM) / batch_num;
|
||||
|
||||
float8 _data0 = 0.f;
|
||||
#if NEURONS_PER_WORK_ITEM > 8
|
||||
float8 _data1 = 0.f;
|
||||
#endif
|
||||
|
||||
uint weight_offset = sub_group_id + neuronIdx;
|
||||
|
||||
for (uint h = 0; h < INPUT0_ELEMENTS_COUNT; h++)
|
||||
{
|
||||
DOT_PRODUCT_8(_data0, input[h * batch_num + batch_id], weight[weight_offset])
|
||||
#if NEURONS_PER_WORK_ITEM > 8
|
||||
DOT_PRODUCT_8(_data1, input[h * batch_num + batch_id], weight[weight_offset + 8])
|
||||
#endif
|
||||
weight_offset += FILTER_OFM_NUM;
|
||||
}
|
||||
|
||||
#if BIAS_TERM
|
||||
ADD_BIAS_8(_data0, bias[neuronIdx + sub_group_id]);
|
||||
#if NEURONS_PER_WORK_ITEM > 8
|
||||
ADD_BIAS_8(_data1, bias[neuronIdx + sub_group_id + 8]);
|
||||
#endif
|
||||
#endif
|
||||
_data0 = ACTIVATION(_data0, ACTIVATION_PARAMS);
|
||||
#if NEURONS_PER_WORK_ITEM > 8
|
||||
_data1 = ACTIVATION(_data1, ACTIVATION_PARAMS);
|
||||
#endif
|
||||
|
||||
intel_sub_group_block_write8((__global uint*)output + out_id, as_uint8(_data0));
|
||||
#if NEURONS_PER_WORK_ITEM > 8
|
||||
intel_sub_group_block_write8((__global uint*)output + out_id + 8 * batch_num, as_uint8(_data1));
|
||||
#endif
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user