[GPU] Better extension requirements checks in kernels. Subgroups basic emulation (#13926)

This commit is contained in:
Vladimir Paramuzov
2022-12-29 09:08:05 +04:00
committed by GitHub
parent 4831a9ead4
commit 13c8b4fdc7
398 changed files with 2706 additions and 3093 deletions

View File

@@ -55,12 +55,15 @@ struct device_info {
bool supports_fp16; ///< Does engine support FP16.
bool supports_fp64; ///< Does engine support FP64.
bool supports_fp16_denorms; ///< Does engine support denormalized FP16.
bool supports_subgroups; ///< Does engine support cl_intel_subgroups extension.
bool supports_subgroups_short; ///< Does engine support cl_intel_subgroups_short extension.
bool supports_subgroups_char; ///< Does engine support cl_intel_subgroups_char extension.
bool supports_local_block_io; ///< Does engine support cl_intel_subgroup_local_block_io extension. Check program build with this option.
bool supports_khr_subgroups; ///< Does engine support cl_khr_subgroups extension.
bool supports_intel_subgroups; ///< Does engine support cl_intel_subgroups extension.
bool supports_intel_subgroups_short; ///< Does engine support cl_intel_subgroups_short extension.
bool supports_intel_subgroups_char; ///< Does engine support cl_intel_subgroups_char extension.
bool supports_intel_required_subgroup_size; ///< Does engine support cl_intel_required_subgroup_size extension.
bool supports_local_block_io; ///< Does engine support cl_intel_subgroup_local_block_io extension.
bool supports_queue_families; ///< Does engine support cl_intel_command_queue_families extension.
bool supports_image; ///< Does engine support images (CL_DEVICE_IMAGE_SUPPORT cap).
bool supports_intel_planar_yuv; ///< Does engine support cl_intel_planar_yuv extension.
bool supports_imad; ///< Does engine support int8 mad.
bool supports_immad; ///< Does engine support int8 multi mad.

View File

@@ -591,6 +591,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
return true;
}
if (node.get_primitive()->deformable_mode)
return false;
// Since reorder inputs is called after this pass
// we have to check that blocked formats can be used in the network and layer is optimized for it.
if ((node.get_output_layout().format == format::b_fs_yx_fsv16 ||

View File

@@ -1016,14 +1016,18 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
const auto& device_info = program->get_engine().get_device_info();
params.uniqueID = std::to_string(param_info.unique_id);
params.engineInfo.bSubGroupSupport = device_info.supports_subgroups;
params.engineInfo.bSubGroupShortSupport = device_info.supports_subgroups_short;
params.engineInfo.bSubGroupCharSupport = device_info.supports_subgroups_char;
params.engineInfo.bFP16Support = device_info.supports_fp16;
params.engineInfo.bFP64Support = device_info.supports_fp64;
params.engineInfo.bIMADSupport = device_info.supports_imad != 0;
params.engineInfo.bIMMADSupport = device_info.supports_immad != 0;
params.engineInfo.bImageSupport = device_info.supports_image != 0;
params.engineInfo.supports_fp16 = device_info.supports_fp16;
params.engineInfo.supports_fp64 = device_info.supports_fp64;
params.engineInfo.supports_fp16_denorms = device_info.supports_fp16_denorms;
params.engineInfo.supports_khr_subgroups = device_info.supports_khr_subgroups;
params.engineInfo.supports_intel_subgroups = device_info.supports_intel_subgroups;
params.engineInfo.supports_intel_subgroups_short = device_info.supports_intel_subgroups_short;
params.engineInfo.supports_intel_subgroups_char = device_info.supports_intel_subgroups_char;
params.engineInfo.supports_intel_required_subgroup_size = device_info.supports_intel_required_subgroup_size;
params.engineInfo.supports_imad = device_info.supports_imad;
params.engineInfo.supports_immad = device_info.supports_immad;
params.engineInfo.enable_sub_groups_emulation = true;
params.engineInfo.bOptHintsSupport = false;
params.engineInfo.bLocalBlockIOSupport = device_info.supports_local_block_io && program->is_local_block_io_supported();
@@ -1038,6 +1042,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.deviceCache = program->get_tuning_cache();
params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
auto impl_forcing_bo = program->get_options().get<build_option_type::force_implementations>();
const auto& impl_forcing = impl_forcing_bo->forcing;

View File

@@ -1066,6 +1066,11 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
auto input_layout = node.get_dependency(0).get_output_layout();
auto output_layout = node.calc_output_layout();
if (prim->deformable_mode) {
output_layout.format = format::adjust_to_rank(format::bfyx, output_layout.get_partial_shape().size());
return output_layout;
}
if (input_layout.is_dynamic() || output_layout.is_dynamic()) {
if (input_layout.get_partial_shape().size() <= 4)
expected_format = format::b_fs_yx_fsv16;

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
KERNEL(activation)(
__global INPUT0_TYPE* input,

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#ifdef PARAMETERIZED

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#if MAX_POOLING

View File

@@ -2,8 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/fetch_utils.cl"
#ifdef BATCH_AXIS
#define VALUES_NUM INPUT0_BATCH_NUM
@@ -44,32 +43,6 @@
#define MINIMUM_NUMBER_FOR_PARTIAL_SORTING 100
#define unroll_for __attribute__((opencl_unroll_hint)) for
///////////////////////// Input offset /////////////////////////
inline uint FUNC(get_input_offset)(uint b, uint f, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#else
#error arg_max_min_axis.cl: input format - not supported
#endif
}
///////////////////////// Output offset ////////////////////////
inline uint FUNC(get_output_offset)(uint b, uint f, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#else
#error arg_max_min_axis.cl: output format - not supported
#endif
}
KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
,__global OUTPUT_TYPE* output
#ifdef SECOND_OUTPUT_EXIST
@@ -174,41 +147,41 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
indices[AXIS] = sort_idx;
iav_type result;
result.value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result.value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
result.index = sort_idx;
for (uint i = 0; i < sort_idx / 8; i++) {
uint index_offset = i * 8;
indices[AXIS] = index_offset;
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 1;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 2;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 3;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 4;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 5;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 6;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
indices[AXIS] = index_offset + 7;
test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
if (sort_position >= TOP_K)
@@ -217,7 +190,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = (sort_idx / 8) * 8; i < sort_idx; i++) {
indices[AXIS] = i;
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_1 test_value)
sort_position++;
}
@@ -227,7 +200,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = sort_idx + 1; i < VALUES_NUM; i++) {
indices[AXIS] = i;
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE test_value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (result.value COMPARE_PARALLEL_SIGN_2 test_value)
sort_position++;
if (sort_position >= TOP_K)
@@ -236,7 +209,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
// Using simple sorting for sorting by indices and when TOP_K == 1
#elif TOP_K == 1
INPUT0_TYPE val = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE val = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
result[0].index = 0;
result[0].value = val;
bool already_exist = false;
@@ -255,7 +228,7 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
}
indices[AXIS] = i;
INPUT0_TYPE in_data = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
INPUT0_TYPE in_data = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
if (val COMPARE_SIGN in_data) {
result[top_k].index = i;
result[top_k].value = in_data;
@@ -270,26 +243,26 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = 0; i < VALUES_NUM / 8; i++) {
uint index_offset = i * 8;
indices[AXIS] = result[index_offset].index = index_offset;
result[index_offset].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 1].index = index_offset + 1;
result[index_offset + 1].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 1].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 2].index = index_offset + 2;
result[index_offset + 2].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 2].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 3].index = index_offset + 3;
result[index_offset + 3].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 3].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 4].index = index_offset + 4;
result[index_offset + 4].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 4].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 5].index = index_offset + 5;
result[index_offset + 5].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 5].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 6].index = index_offset + 6;
result[index_offset + 6].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 6].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = result[index_offset + 7].index = index_offset + 7;
result[index_offset + 7].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[index_offset + 7].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint i = (VALUES_NUM / 8) * 8; i < VALUES_NUM; i++) {
indices[AXIS] = result[i].index = i;
result[i].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
result[i].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint k = 1; k < VALUES_NUM; k *= 2) {
@@ -320,26 +293,26 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
for (uint i = 0; i < VALUES_NUM / 8; i++) {
uint index_offset = i * 8;
indices[AXIS] = temp_buf[index_offset].index = index_offset;
temp_buf[index_offset].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 1].index = index_offset + 1;
temp_buf[index_offset + 1].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 1].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 2].index = index_offset + 2;
temp_buf[index_offset + 2].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 2].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 3].index = index_offset + 3;
temp_buf[index_offset + 3].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 3].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 4].index = index_offset + 4;
temp_buf[index_offset + 4].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 4].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 5].index = index_offset + 5;
temp_buf[index_offset + 5].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 5].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 6].index = index_offset + 6;
temp_buf[index_offset + 6].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 6].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
indices[AXIS] = temp_buf[index_offset + 7].index = index_offset + 7;
temp_buf[index_offset + 7].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[index_offset + 7].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint i = (VALUES_NUM / 8) * 8; i < VALUES_NUM; i++) {
indices[AXIS] = temp_buf[i].index = i;
temp_buf[i].value = input[FUNC_CALL(get_input_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])];
temp_buf[i].value = input[FUNC_CALL(get_input_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])];
}
for (uint group = 0; group < group_num - 1; group++) {
@@ -439,22 +412,22 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
#if SORT_BY_VALUE
indices[AXIS] = sort_position;
#ifdef TOP_K_ORDER
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.value);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.value);
#else
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.index);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result.index);
#endif
#ifdef SECOND_OUTPUT_EXIST
#ifdef MULTIPLE_OUTPUTS
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result.value);
#endif
#else
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result.value);
#endif
#endif
#endif
@@ -472,22 +445,22 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
indices[AXIS] = out_position;
#ifdef TOP_K_ORDER
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].value);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].value);
#else
output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].index);
output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT_TYPE(result[top_k].index);
#endif
#ifdef SECOND_OUTPUT_EXIST
#ifdef MULTIPLE_OUTPUTS
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_OUTPUT1_TYPE(result[top_k].value);
#endif
#else
#ifdef TOP_K_ORDER
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].index);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].index);
#else
second_output[FUNC_CALL(get_output_offset)(indices[0], indices[1], indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].value);
second_output[FUNC_CALL(get_output_index)(indices[0], indices[1], 0, indices[2], indices[3], indices[4])] = TO_INPUT1_TYPE(result[top_k].value);
#endif
#endif
#endif
@@ -504,4 +477,3 @@ KERNEL(arg_max_min_modified)(const __global INPUT0_TYPE* input
#undef AXIS
#undef VALUES_NUM
#undef MINIMUM_NUMBER_FOR_PARTIAL_SORTING
#undef unroll_for

View File

@@ -3,8 +3,7 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#define GLOBAL_SIZE 128
#define LOCAL_SIZE GLOBAL_SIZE
@@ -13,7 +12,7 @@
#define INPUT0_FILL_VAL INPUT0_VAL_MIN
#else
#define COMPARE_SIGN >
#define INPUT0_FILL_VAL INPUT0_VAL_MAX
#define INPUT0_FILL_VAL INPUT0_VAL_MAX
#endif
__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
@@ -39,8 +38,7 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
uint temp_index = global_index;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < TOP_K; i++){
unroll_for(uint i = 0; i < TOP_K; i++){
accumulator.index = global_index;
accumulator.value = input[global_index];
for (int j = 0; j < i; j++){
@@ -49,10 +47,10 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
}
global_index += GLOBAL_SIZE;
#ifdef INPUT0_LAYOUT_BFYX
while (global_index < size + batch_offset)
while (global_index < size + batch_offset)
#else
while (global_index < size)
#endif
#endif
{
iav_type element;
element.value = input[global_index];
@@ -72,7 +70,7 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
global_index += GLOBAL_SIZE * INPUT0_BATCH_NUM;
#endif
}
#ifdef INPUT0_LAYOUT_BFYX
if (local_index < size)
scratch[local_index] = accumulator;
@@ -84,14 +82,13 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
else
scratch[local_index].value = INPUT0_FILL_VAL;
#endif
barrier(CLK_LOCAL_MEM_FENCE);
__attribute__((opencl_unroll_hint))
for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
unroll_for(uint offset = LOCAL_SIZE / 2; offset > 0; offset /= 2)
{
if (local_index < offset)
if (local_index < offset)
{
iav_type other = scratch[local_index + offset];
iav_type mine = scratch[local_index];
@@ -103,16 +100,16 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
}
barrier(CLK_LOCAL_MEM_FENCE);
}
#ifdef INPUT0_LAYOUT_BFYX
if (local_index == 0)
if (local_index == 0)
{
output[current_batch * TOP_K + i] = scratch[0].index % size;
}
global_index = temp_index;
results[i] = scratch[0].index % size;
#else
if (local_index == 0)
if (local_index == 0)
{
output[current_batch + i*INPUT0_BATCH_NUM] = scratch[0].index / INPUT0_BATCH_NUM;
}
@@ -123,4 +120,4 @@ KERNEL(arg_max_gpu_top_k)(const __global INPUT0_TYPE* input, __global OUTPUT_TYP
}
#undef COMPARE_SIGN
#undef INPUT0_FILL_VAL
#undef INPUT0_FILL_VAL

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#ifndef SG_SIZE
#define SG_SIZE 16
@@ -36,7 +35,7 @@
#endif
__attribute__((intel_reqd_sub_group_size(SG_SIZE)))
REQD_SUB_GROUP_SIZE(SG_SIZE)
__attribute__((reqd_work_group_size(SG_SIZE, 1, 1)))
KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
{
@@ -56,8 +55,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
// (gid + 1) <= input_size / (INB_ARRAY_SIZE * SG_SIZE) -> as gid is integral, the floor is not an issue
if (gid + 1 <= input_size / (INB_ARRAY_SIZE * SG_SIZE))
{
__attribute__((opencl_unroll_hint))
for (uint ai = 0; ai < INB_ARRAY_SIZE; ++ai)
unroll_for(uint ai = 0; ai < INB_ARRAY_SIZE; ++ai)
{
// Can be exchanged with sub-group block read to INB_ARRAY_SIZE-component vector.
input_blocks[ai] = input[gid * INB_ARRAY_SIZE * SG_SIZE + ai * SG_SIZE + lid];
@@ -69,8 +67,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
const uint last_gid = input_size / (INB_ARRAY_SIZE * SG_SIZE);
uint ai = 0;
__attribute__((opencl_unroll_hint))
for (uint last_base_off = last_gid * INB_ARRAY_SIZE * SG_SIZE; last_base_off + SG_SIZE <= input_size; last_base_off += SG_SIZE)
unroll_for(uint last_base_off = last_gid * INB_ARRAY_SIZE * SG_SIZE; last_base_off + SG_SIZE <= input_size; last_base_off += SG_SIZE)
{
// Can be exchanged with sub-group block read to scalar.
input_blocks[ai] = input[last_base_off + lid];
@@ -85,8 +82,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
indices[ai++] = lid < input_size - remainder_off ? remainder_off + lid : 0;
}
__attribute__((opencl_unroll_hint))
for (; ai < INB_ARRAY_SIZE; ++ai)
unroll_for(; ai < INB_ARRAY_SIZE; ++ai)
{
input_blocks[ai] = UNIT_FILL_VAL;
}
@@ -98,8 +94,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
UNIT_TYPE acc[minmax_acc_array_size];
uint result[minmax_acc_array_size];
__attribute__((opencl_unroll_hint))
for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
unroll_for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
{
acc[ai] = UNIT_FILL_VAL;
result[ai] = 0;
@@ -109,24 +104,22 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
__attribute__((opencl_unroll_hint(1)))
for (uint ii = 0; ii < INB_ARRAY_SIZE * SG_SIZE; ++ii)
{
UNIT_TYPE in_val = intel_sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
uint in_index = intel_sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
__attribute__((opencl_unroll_hint))
for (uint ai = 0; ai < minmax_acc_array_size; ++ai)
UNIT_TYPE in_val = _sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
uint in_index = _sub_group_shuffle(input_blocks[ii / SG_SIZE], ii % SG_SIZE);
unroll_for(uint ai = 0; ai < minmax_acc_array_size; ++ai)
{
bool insert_flag = (in_val OP_ARG_REL acc[ai]);
if (sub_group_any(insert_flag))
{
__attribute__((opencl_unroll_hint))
for (uint aj = minmax_acc_array_size; aj > ai + 1; --aj)
unroll_for(uint aj = minmax_acc_array_size; aj > ai + 1; --aj)
{
acc[aj - 1] = intel_sub_group_shuffle_up(acc[aj - 2], acc[aj - 1], 1);
result[aj - 1] = intel_sub_group_shuffle_up(result[aj - 2], acc[aj - 1], 1);
acc[aj - 1] = _sub_group_shuffle_up(acc[aj - 2], acc[aj - 1], 1);
result[aj - 1] = _sub_group_shuffle_up(result[aj - 2], acc[aj - 1], 1);
}
UNIT_TYPE in_val_acc_mask = select(in_val, acc[ai], insert_flag);
uint in_index_mask = select(in_index, result[ai], insert_flag);
acc[ai] = select(acc[ai], intel_sub_group_shuffle_up(in_val, in_val_acc_mask, 1), insert_flag);
result[ai] = select(result[ai], intel_sub_group_shuffle_up(in_index, in_index_mask, 1), insert_flag);
acc[ai] = select(acc[ai], _sub_group_shuffle_up(in_val, in_val_acc_mask, 1), insert_flag);
result[ai] = select(result[ai], _sub_group_shuffle_up(in_index, in_index_mask, 1), insert_flag);
break;
}
}
@@ -135,8 +128,7 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
// Write TOP_K sorted results.
uint ai = 0;
__attribute__((opencl_unroll_hint))
for (uint k_base_off = 0; k_base_off + SG_SIZE <= TOP_K; k_base_off += SG_SIZE)
unroll_for (uint k_base_off = 0; k_base_off + SG_SIZE <= TOP_K; k_base_off += SG_SIZE)
{
output[k_base_off + lid] = result[ai++] % input_size;
}
@@ -161,4 +153,4 @@ KERNEL(arg_max_min_opt)(const __global UNIT_TYPE* input, __global uint* output)
#undef UNIT_FILL_VAL
#undef UNIT_FILL_VAL_NEEDSUNDEF_
#endif
#undef OP_ARG_REL
#undef OP_ARG_REL

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(average_unpooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(batch_to_space_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,17 +2,19 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define OC_BLOCK_SIZE 32
#define GET_WEI(data, id) intel_sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#define GET_WEI(data, id) _sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) _sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint(val))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(binary_convolution_1x1)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,

View File

@@ -2,17 +2,19 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
#define OC_BLOCK_SIZE 16
#define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#define GET_SRC(data, id) _sub_group_shuffle(data, id)
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(binary_convolution_1x1_b_fs_yx_fsv16)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,

View File

@@ -2,13 +2,14 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define OC_BLOCK_SIZE 32
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(intel_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_uint(_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) as_uint2(_sub_group_block_read2((const __global uint*)(ptr) + (byte_offset)))
#if BINARY_PACKED_OUTPUT
#define BUFFER_TYPE UNIT_TYPE
@@ -16,7 +17,7 @@
#define BUFFER_TYPE OUTPUT_TYPE
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -107,7 +108,7 @@ KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
__attribute__((opencl_unroll_hint(SUB_GROUP_SIZE)))
for (int i = 0; i < SUB_GROUP_SIZE; i++)
{
INPUT0_TYPE src = intel_sub_group_shuffle(line_cache[(kw + i*STRIDE_SIZE_X) / SUB_GROUP_SIZE],
INPUT0_TYPE src = _sub_group_shuffle(line_cache[(kw + i*STRIDE_SIZE_X) / SUB_GROUP_SIZE],
(kw + i*STRIDE_SIZE_X) % SUB_GROUP_SIZE);
#if EXCLUDE_PAD
int compute = ((input_x + kw + i*STRIDE_SIZE_X >= 0) &&
@@ -149,7 +150,7 @@ KERNEL(binary_convolution_generic)(const __global INPUT0_TYPE* input,
for (int i = 0; i < SUB_GROUP_SIZE*2; i++)
{
#if EXCLUDE_PAD
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*intel_sub_group_shuffle(real_ks, i%SUB_GROUP_SIZE) - 2*dst_buf[i]);
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*_sub_group_shuffle(real_ks, i%SUB_GROUP_SIZE) - 2*dst_buf[i]);
#else
CONV_RESULT_TYPE res = TO_CONV_RESULT_TYPE(INPUT0_FEATURE_NUM*FILTER_SIZE_Y*FILTER_SIZE_X - 2*dst_buf[i]);
#endif

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(binary_convolution_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,34 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS == 6
return INPUT0_GET_INDEX(b, f, w, z, y, x);
#else
#error [clDNN border_gpu_ref.cl]: input format - not supported
#endif
}
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS == 6
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#else
#error [clDNN border_gpu_ref.cl]: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
KERNEL(border_gpu_ref)(
const __global INPUT0_TYPE* input,

View File

@@ -2,8 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/fetch_utils.cl"
#define GET_UPDATES_INDEX(prefix, idx_order) CAT(prefix, _GET_INDEX)(idx_order)

View File

@@ -3,7 +3,8 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#define WORK_GROUP_SIZE 16
#define IC_BLOCK 16
@@ -21,10 +22,8 @@
# define TILE_F 1
#endif
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
__attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
__attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(WORK_GROUP_SIZE)
KERNEL (concatenation_gpu_blocked)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -52,8 +51,7 @@ KERNEL (concatenation_gpu_blocked)(
OUTPUT_BLOCK_WRITE(output, dst_index, res);
} else {
if (lid < INPUT0_FEATURE_NUM % IC_BLOCK) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(((INPUT0_TYPE*)&src)[tx], ACTIVATION_PARAMS));
output[dst_index + tx * IC_BLOCK + lid] = res;
}
@@ -78,12 +76,11 @@ KERNEL (concatenation_gpu_blocked)(
INPUT_VEC_TYPE src_al1 = 0;
INPUT_VEC_TYPE src_al2 = 0;
#endif
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
((INPUT0_TYPE*)&src_al0)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src0)[tx], ((INPUT0_TYPE*)&src1)[tx], (IC_BLOCK - MISALIGNMENT));
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
((INPUT0_TYPE*)&src_al0)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src0)[tx], ((INPUT0_TYPE*)&src1)[tx], (IC_BLOCK - MISALIGNMENT));
#if TILE_F == 4
((INPUT0_TYPE*)&src_al1)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src1)[tx], ((INPUT0_TYPE*)&src2)[tx], (IC_BLOCK - MISALIGNMENT));
((INPUT0_TYPE*)&src_al2)[tx] = intel_sub_group_shuffle_down(((INPUT0_TYPE*)&src2)[tx], ((INPUT0_TYPE*)&src3)[tx], (IC_BLOCK - MISALIGNMENT));
((INPUT0_TYPE*)&src_al1)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src1)[tx], ((INPUT0_TYPE*)&src2)[tx], (IC_BLOCK - MISALIGNMENT));
((INPUT0_TYPE*)&src_al2)[tx] = _sub_group_shuffle_down(((INPUT0_TYPE*)&src2)[tx], ((INPUT0_TYPE*)&src3)[tx], (IC_BLOCK - MISALIGNMENT));
#endif
}
OUTPUT_VEC_TYPE res_al0 = TO_OUTPUT_VEC_TYPE(ACTIVATION(src_al0, ACTIVATION_PARAMS));
@@ -105,8 +102,7 @@ KERNEL (concatenation_gpu_blocked)(
#endif
dst_index = OUTPUT_GET_INDEX(b, (f_block*IC_BLOCK + lid_f_offset + output_offset_in_concat_axis), y, x);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
OUTPUT_TYPE res_unal = TO_OUTPUT_TYPE(ACTIVATION(((INPUT0_TYPE*)&src_unal)[tx], ACTIVATION_PARAMS));
output[dst_index + tx * IC_BLOCK] = res_unal;
}
@@ -115,15 +111,13 @@ KERNEL (concatenation_gpu_blocked)(
{
const uint dst_index = OUTPUT_GET_INDEX(b, (f_block*IC_BLOCK + lid + output_offset_in_concat_axis), y, x);
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < TILE_F; ++fw) {
unroll_for(uint fw = 0; fw < TILE_F; ++fw) {
if (TILE_F != 1 && CEIL_DIV(INPUT0_FEATURE_NUM, IC_BLOCK) % TILE_F != 0 && CEIL_DIV(INPUT0_FEATURE_NUM, IC_BLOCK) % TILE_F == fw)
break;
bool do_leftover_write = INPUT0_FEATURE_NUM % IC_BLOCK == 0 || f_block * IC_BLOCK + fw * IC_BLOCK + lid < INPUT0_FEATURE_NUM;
if (do_leftover_write) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_XY; ++tx) {
unroll_for(uint tx = 0; tx < TILE_XY; ++tx) {
INPUT0_TYPE src = input[input_offset + lid + tx * IC_BLOCK + fw * INPUT0_FEATURE_PITCH * IC_BLOCK];
OUTPUT_TYPE res = TO_OUTPUT_TYPE(ACTIVATION(src, ACTIVATION_PARAMS));
output[dst_index + tx * IC_BLOCK + fw * OUTPUT_FEATURE_PITCH * IC_BLOCK] = res;
@@ -144,4 +138,3 @@ KERNEL (concatenation_gpu_blocked)(
#undef OUTPUT_BLOCK_WRITE
#undef TILE_F
#undef CEIL_DIV

View File

@@ -2,7 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
//
@@ -16,17 +17,9 @@
#define WORK_GROUP_SIZE 16
#define INPUT0_ELEMENTS_COUNT (INPUT0_LENGTH/INPUT0_BATCH_NUM)
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (byte_offset), as_ushort8(val))
#else
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (byte_offset), as_uint8(val))
#endif
__attribute__((reqd_work_group_size(1, WORK_GROUP_SIZE, 1)))
__attribute__((intel_reqd_sub_group_size(WORK_GROUP_SIZE)))
KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __global UNIT_TYPE* output, uint output_offset_in_concat_axis)
REQD_SUB_GROUP_SIZE(WORK_GROUP_SIZE)
KERNEL(concatenation_gpu_depth_bfyx_no_pitch)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
{
const uint batch_id = get_group_id(0);
@@ -41,7 +34,7 @@ KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __gl
const uint output_offset = OUTPUT_OFFSET + element_group_offset + output_batch_offset + output_offset_in_concat_axis*OUTPUT_PITCHES[CONCAT_AXIS_INDEX];
//Check if current group in batch starts from 16-byte aligned pos. If not then move block read to 16-byte aligned position.
//Requirement for intel_sub_group_block_write8.
//Requirement for _sub_group_block_write8.
uint align_offset = 0;
const uint group_start_pos = output_offset;
if(group_start_pos % WORK_GROUP_SIZE != 0)
@@ -52,8 +45,8 @@ KERNEL (concatenation_gpu_depth_bfyx_no_padding)(__global UNIT_TYPE* input, __gl
if(element_group_offset + align_offset + WORK_GROUP_SIZE * ELEMENTS_PER_WORK_ITEM < INPUT0_ELEMENTS_COUNT)
{
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) in = ALIGNED_BLOCK_READ8(input, input_offset + align_offset);
ALIGNED_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, ACTIVATION_PARAMS));
MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) in = DT_INPUT_BLOCK_READ8(input, input_offset + align_offset);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset + align_offset, ACTIVATION(in, ACTIVATION_PARAMS));
//Fill the values that were missed upon adding align_offset
if((align_offset != 0) && (element_offset + output_batch_offset < group_start_pos + align_offset))

View File

@@ -2,12 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/unit_type.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
@@ -23,10 +20,10 @@
// must be equal FSV / SUB_GROUP_SIZE
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
__global UNIT_TYPE* output,
KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
uint output_offset_in_concat_axis
)
{
@@ -44,12 +41,12 @@ KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
input_offset += b * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV;
input_offset += fs * INPUT0_SIZE_X_WITH_PADDING * INPUT0_SIZE_Y_WITH_PADDING * FSV * INPUT0_BATCH_NUM;
UNIT_TYPE2 in = UNIT_BLOCK_READ2(input, input_offset);
MAKE_VECTOR_TYPE(INPUT0_TYPE, 2) in = DT_INPUT_BLOCK_READ2(input, input_offset);
in = ACTIVATION(in, ACTIVATION_PARAMS);
#if ALIGNED
const uint dst_index = OUTPUT_GET_INDEX(b, output_offset_in_concat_axis + fs * FSV, y, x);
UNIT_BLOCK_WRITE2(output, dst_index, in);
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, in);
#else
const uint dst_feature = fs * FSV + output_offset_in_concat_axis + sglid;
if (dst_feature + SUB_GROUP_SIZE < OUTPUT_FEATURE_NUM) {
@@ -63,8 +60,6 @@ KERNEL (concatenation_gpu_fs_b_yx_fsv32)(__global UNIT_TYPE* input,
#endif
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define GET_INDEX(prefix, ORDER) CAT(prefix, _GET_INDEX)(ORDER)

View File

@@ -2,53 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_SIMPLE && INPUT0_DIMS <= 4
return GET_DATA_INDEX(INPUT0, b, f, y, x);
#elif INPUT0_SIMPLE && INPUT0_DIMS == 5
return GET_DATA_INDEX_5D(INPUT0, b, f, z, y, x);
#elif INPUT0_SIMPLE && INPUT0_DIMS == 6
return GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x);
#elif INPUT0_LAYOUT_B_FS_ZYX_FSV16
return GET_DATA_B_FS_ZYX_FSV16_INDEX(INPUT0, b, f, z, y, x);
#elif INPUT0_LAYOUT_BS_FS_ZYX_BSV16_FSV16
return GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(INPUT0, b, f, z, y, x);
#elif INPUT0_LAYOUT_BS_FS_YX_BSV16_FSV16
return GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, y, x);
#elif INPUT0_LAYOUT_BS_FS_YX_BSV32_FSV32
return GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(INPUT0, b, f, y, x);
#else
#error concatenation_gpu_simple_ref.cl: input format - not supported
#endif
}
///////////////////////// Output Index /////////////////////////
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_SIMPLE && OUTPUT_DIMS <= 4
return GET_DATA_INDEX(OUTPUT, b, f, y, x);
#elif OUTPUT_SIMPLE && OUTPUT_DIMS == 5
return GET_DATA_INDEX_5D(OUTPUT, b, f, z, y, x);
#elif OUTPUT_SIMPLE && OUTPUT_DIMS == 6
return GET_DATA_INDEX_6D(OUTPUT, b, f, w, z, y, x);
#elif OUTPUT_LAYOUT_B_FS_ZYX_FSV16
return GET_DATA_B_FS_ZYX_FSV16_INDEX(OUTPUT, b, f, z, y, x);
#elif OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16
return GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(OUTPUT, b, f, z, y, x);
#elif OUTPUT_LAYOUT_BS_FS_YX_BSV16_FSV16
return GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(OUTPUT, b, f, y, x);
#elif OUTPUT_LAYOUT_BS_FS_YX_BSV32_FSV32
return GET_DATA_BS_FS_YX_BSV32_FSV32_INDEX(OUTPUT, b, f, y, x);
#else
#error concatenation_gpu_simple_ref.cl: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
{

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#if defined(CONVERT_FROM_NV12) || defined(CONVERT_FROM_I420)
#ifdef BUFFER_MEM

View File

@@ -4,8 +4,9 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define TYPE_N_(type, n) type##n
#define TYPE_N(type, n) TYPE_N_(type, n)
@@ -60,13 +61,10 @@
#endif
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define FSV 16
#define SIMD 16
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(1, SIMD * FEATURE_SLM_SPLIT, 1)))
KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
const __global INPUT0_TYPE *conv_input,
@@ -102,8 +100,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
const uint max_out_yx = OUTPUT_SIZE_X * OUTPUT_SIZE_Y;
uint max_local_yx = min(max_out_yx, out_yx_sg + OUT_BLOCK_SPATIAL);
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
uint out_yx_shuffle = out_yx_sg + sglid + os * SIMD;
uint out_yx_clamp = max_out_yx % OUT_BLOCK_SPATIAL == 0
? out_yx_shuffle
@@ -136,8 +133,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
uint input_y[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
#endif
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
#ifdef SHOULD_USE_DATA_ZP
input_x[os] = out_x_shuffle[os] * STRIDE_SIZE_X - PADDING_SIZE_X;
input_y[os] = out_y_shuffle[os] * STRIDE_SIZE_Y - PADDING_SIZE_Y;
@@ -158,18 +154,15 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
uint4 weights_zp_val[OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
}
#if INPUT0_FEATURE_NUM % FSV != 0
uint4 weights_zp_vec_partial[OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
__attribute__((opencl_unroll_hint))
for (uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
unroll_for(uint f = INPUT0_FEATURE_NUM % FSV; f < FSV; f++) {
wzp_p[f] = 0;
}
}
@@ -181,8 +174,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#if INPUT0_FEATURE_NUM % FSV != 0
if (feature_offset + (k + 1) * FSV >= ALIGN(INPUT0_FEATURE_NUM, FSV)) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
}
}
@@ -199,11 +191,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
dotProdAZPxWZP[ofb] = 0;
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
unroll_for(uint ive = 0; ive < 4; ive++) {
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxWZP[ofb][ive],
AS_INPUT0_TYPE_4(data_zp_val[ive]),
@@ -213,14 +203,12 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#endif
uint4 weights_val[OUT_BLOCK_FEATURES] = { };
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
weights_val[ofb] = vload4(0, (__global uint*)(weights + filter_idx + ofb * WEIGHTS_FEATURE_BLOCK_PITCH));
}
uint4 input_val[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
#if defined ASYMMETRIC_DATA_QUANTIZATION && defined NON_ZERO_INPUT0_PAD_BEFORE
if (((input_x[os] < 0) || (input_x[os] >= INPUT0_SIZE_X)) ||
((input_y[os] < 0) || (input_y[os] >= INPUT0_SIZE_Y))) {
@@ -236,12 +224,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#if OUT_BLOCK_FEATURES > 1 && FEATURE_SLM_SPLIT != 1 && OUT_BLOCK_SPATIAL > 14
// For some cases compiler spills here due to loop order
// Use suboptimal order to avoid this at cost of instruction dispatch delays.
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ++ive) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ive = 0; ive < 4; ++ive) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
@@ -250,10 +235,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif
#else
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ++ive) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ive = 0; ive < 4; ++ive) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
@@ -261,10 +244,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
AS_INPUT0_TYPE_4(data_zp_val[ive]),
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#endif
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(_sub_group_shuffle(input_val[os / SIMD][ive], os % SIMD));
dotProd[ofb][os] = IMAD(dotProd[ofb][os],
inputs,
@@ -293,8 +275,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
filter_idx += WEIGHTS_IS_PITCH;
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
input_idx[os] += INPUT0_FEATURE_PITCH * FSV;
}
@@ -317,27 +298,21 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
__local ACCUMULATOR_TYPE* partial_acc_ptr = partial_acc + sgid_start_idx * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL + sglid;
if (get_sub_group_id() < OUT_BLOCK_FEATURES) {
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
unroll_for(uint wg = 0; wg < OUT_BLOCK_FEATURES; ++wg) {
if (get_sub_group_id() == wg) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < wg; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ofb = 0; ofb < wg; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_SPATIAL * SIMD +
os * SIMD;
partial_acc_ptr[partial_acc_ptr_idx] = dotProd[ofb][os];
}
}
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
dotProd[0][os] = dotProd[wg][os];
}
__attribute__((opencl_unroll_hint))
for (uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ofb = wg + 1; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
((wg != 0) ? OUT_BLOCK_SPATIAL * OUT_BLOCK_FEATURES * SIMD : 0) +
ofb * OUT_BLOCK_SPATIAL * SIMD +
@@ -348,10 +323,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
}
} else {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint ofb = 0; ofb < OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_SPATIAL * SIMD +
os * SIMD;
@@ -366,10 +339,8 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
return;
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_SPATIAL * SIMD + sglid;
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
const uint partial_acc_ptr_idx =
wg * OUT_BLOCK_FEATURES * SIMD * OUT_BLOCK_SPATIAL +
os * SIMD;
@@ -399,18 +370,15 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#ifdef COMPENSATION_TERM
COMPENSATION_TYPE comp[FINAL_OUT_BLOCK_FEATURES];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
comp[ofb] = compensation[out_f + ofb * SIMD];
}
#endif
// Convert accumulator type to activation type
ACTIVATION_TYPE dequantized[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
dequantized[ofb][os] = TO_ACTIVATION_TYPE(dotProd[ofb][os]);
#if BIAS_TERM
@@ -424,13 +392,11 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
// Fused ops/activation
OUTPUT_TYPE result[FINAL_OUT_BLOCK_FEATURES][OUT_BLOCK_SPATIAL];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_PRELOAD_SCALAR;
#endif
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
#if HAS_FUSED_OPS
#if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_CALC_SCALAR;
@@ -462,10 +428,9 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
if (can_use_full_block_write) {
uint output_idx = OUTPUT_GET_INDEX(out_b,
out_fg,
intel_sub_group_shuffle(out_y_shuffle[0], 0),
intel_sub_group_shuffle(out_x_shuffle[0], 0));
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
_sub_group_shuffle(out_y_shuffle[0], 0),
_sub_group_shuffle(out_x_shuffle[0], 0));
unroll_for(uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
@@ -474,8 +439,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#if OUTPUT_TYPE_SIZE == 1
for (; os + 8 <= OUT_BLOCK_SPATIAL; os += 8) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 8; ++i) {
unroll_for(uint i = 0; i < 8; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE8(output, output_idx, result_val);
@@ -485,8 +449,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#if OUTPUT_TYPE_SIZE <= 2
for (; os + 4 <= OUT_BLOCK_SPATIAL; os += 4) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 4; ++i) {
unroll_for(uint i = 0; i < 4; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE4(output, output_idx, result_val);
@@ -495,8 +458,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#endif
for (; os + 2 <= OUT_BLOCK_SPATIAL; os += 2) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 2; ++i) {
unroll_for(uint i = 0; i < 2; ++i) {
result_val[i] = result[ofb][os + i];
}
DT_OUTPUT_BLOCK_WRITE2(output, output_idx, result_val);
@@ -512,23 +474,20 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
} else {
uint output_idx_shuffle[CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD)] = { };
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
output_idx_shuffle[os] = OUTPUT_GET_INDEX(out_b, out_fg, out_y_shuffle[os], out_x_shuffle[os]);
}
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
unroll_for(uint ofb = 0; ofb < FINAL_OUT_BLOCK_FEATURES; ++ofb) {
bool good_of_block = (CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES == 0)
|| (out_fg + FINAL_OUT_BLOCK_FEATURES * SIMD <= OUTPUT_FEATURE_NUM)
|| (ofb < CEIL_DIV(OUTPUT_FEATURE_NUM, SIMD) % FINAL_OUT_BLOCK_FEATURES);
if (good_of_block) {
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
unroll_for(uint os = 0; os < OUT_BLOCK_SPATIAL; ++os) {
bool good_os = (max_out_yx % OUT_BLOCK_SPATIAL == 0) || (out_yx_sg <= max_out_yx - OUT_BLOCK_SPATIAL) || (os < max_out_yx % OUT_BLOCK_SPATIAL);
if (!good_os)
break;
uint output_idx = intel_sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
uint output_idx = _sub_group_shuffle(output_idx_shuffle[os / SIMD], os % SIMD);
bool good_of = (OUTPUT_FEATURE_NUM % SIMD == 0) || (out_f + ofb * SIMD < OUTPUT_FEATURE_NUM);
if (!good_of)
@@ -538,8 +497,7 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
}
}
__attribute__((opencl_unroll_hint))
for (uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
unroll_for(uint os = 0; os < CEIL_DIV(OUT_BLOCK_SPATIAL, SIMD); ++os) {
output_idx_shuffle[os] += OUTPUT_FEATURE_PITCH * FSV;
}
}
@@ -582,8 +540,5 @@ KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
#undef AS_FILTER_TYPE_4
#undef CEIL_DIV
#undef ALIGN
#undef SIMD
#undef FSV

View File

@@ -4,8 +4,7 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
// ======================================================================================
// Host side jit-constants:
@@ -23,8 +22,6 @@
// data prefetching; requires additional global barrier
// ======================================================================================
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define FSV 4
#define WEIGHTS_OSV 16
@@ -61,7 +58,7 @@
// WI: 1 x FEATURES_PER_WI x 1
// SG: 1 x FEATURES_PER_WI x SIMD
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(SIMD, 1, LWG_DEPTH)))
KERNEL(convolution)(
const __global uint *input,
@@ -134,7 +131,7 @@ KERNEL(convolution)(
weights_offset += WEIGHTS_IS_PITCH / FSV * LWG_DEPTH;
unroll_for (uint out_fi = 0; out_fi < FEATURES_PER_WI; ++out_fi) {
int wei_i = intel_sub_group_shuffle(wei_sg[out_fi / SIMD], out_fi % SIMD);
int wei_i = _sub_group_shuffle(wei_sg[out_fi / SIMD], out_fi % SIMD);
FILTER_TYPE4 wei_val = AS_FILTER_TYPE4(wei_i);
dotProd[out_fi] = IMAD(dotProd[out_fi], in_val, wei_val);
@@ -223,8 +220,6 @@ KERNEL(convolution)(
}
}
#undef unroll_for
#undef FSV
#undef WEIGHTS_OSV

View File

@@ -2,10 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
// ======================================================================================
// Host side jit-constants:
@@ -51,7 +51,6 @@
#define WEIGHTS_YXS_PITCH 4
#define FILTER_SPATIAL_SIZE (FILTER_SIZE_X * FILTER_SIZE_Y)
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#if FILTER_BLOCKED < FILTER_SPATIAL_SIZE && FILTER_BLOCKED % 4 != 0
# error convolution_gpu_b_fs_yx_fsv4_dw.cl - filter blocks must either cover whole spatial filter or be multiple of 4.
@@ -76,9 +75,9 @@
#endif
#if TILED
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
#endif
KERNEL(convolution)(
KERNEL(convolution_gpu_b_fs_yx_fsv4_dw)(
const __global INPUT_TYPE4 *input,
__global OUTPUT_TYPE4 *output,
const __global FILTER_TYPE4 *weights,
@@ -114,11 +113,9 @@ KERNEL(convolution)(
#if PRELOAD_INPUT || TILED
INPUT_TYPE4 in[FILTER_SIZE_Y * INPUT_LINE_SIZE];
__attribute__((opencl_unroll_hint))
for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) {
unroll_for (uint yi = 0; yi < FILTER_SIZE_Y; ++yi) {
// TODO Try to avoid loading last input line in padded situations
__attribute__((opencl_unroll_hint))
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
uint preload_offset = yi * INPUT_LINE_SIZE + xi;
uint input_x_offset = xi * (INPUT_X_PITCH / FSV);
uint input_y_offset = yi * (DILATION_SIZE_Y * INPUT_Y_PITCH / FSV);
@@ -135,10 +132,8 @@ KERNEL(convolution)(
#if PRELOAD_WEIGHTS
FILTER_TYPE4 wei[CEIL_DIV(FILTER_SPATIAL_SIZE, 4) * 4];
__attribute__((opencl_unroll_hint))
for (uint fsi = 0; fsi < FILTER_SPATIAL_SIZE; fsi += 4) {
__attribute__((opencl_unroll_hint))
for (uint ofi = 0; ofi < 4; ++ofi) {
unroll_for (uint fsi = 0; fsi < FILTER_SPATIAL_SIZE; fsi += 4) {
unroll_for(uint ofi = 0; ofi < 4; ++ofi) {
uint preload_offset = (fsi / 4) * 4 + ofi;
uint weights_idx = weights_offset + ofi * WEIGHTS_I_PITCH + (fsi / 4) * WEIGHTS_YXS_PITCH;
wei[preload_offset] = weights[weights_idx];
@@ -159,8 +154,7 @@ for (; y < tile_y_end; ++y) {
int acc[OUTPUT_BLOCK_X][4] = { };
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
unroll_for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
uint4 fx = fis % FILTER_SIZE_X;
@@ -178,17 +172,16 @@ for (; y < tile_y_end; ++y) {
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
#endif
__attribute__((opencl_unroll_hint))
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
INPUT_TYPE4 in_trans0;
INPUT_TYPE4 in_trans1;
INPUT_TYPE4 in_trans2;
INPUT_TYPE4 in_trans3;
#if TILED
in_trans0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans1 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans2 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans3 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s3]), (fx.s3 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans1 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans2 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans3 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s3]), (fx.s3 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
#elif PRELOAD_INPUT
uint4 input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
uint4 input_y_offset = fy * INPUT_LINE_SIZE;
@@ -243,19 +236,18 @@ for (; y < tile_y_end; ++y) {
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
# endif
__attribute__((opencl_unroll_hint))
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
INPUT_TYPE4 in_trans0;
INPUT_TYPE4 in_trans1;
INPUT_TYPE4 in_trans2;
INPUT_TYPE4 in_trans3;
#if TILED
in_trans0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s0]), (fx.s0 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# if FILTER_BLOCKED % 4 > 1
in_trans1 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans1 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s1]), (fx.s1 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# endif
# if FILTER_BLOCKED % 4 > 2
in_trans2 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in_trans2 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy.s2]), (fx.s2 * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# endif
#elif PRELOAD_INPUT
uint4 input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
@@ -317,16 +309,14 @@ for (; y < tile_y_end; ++y) {
wei3 = weights[weights_offset + 3 * WEIGHTS_I_PITCH];
# endif
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
unroll_for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
__attribute__((opencl_unroll_hint))
for (uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
unroll_for(uint oxi = 0; oxi < OUTPUT_BLOCK_X; ++oxi) {
# if TILED
in0 = AS_INPUT_TYPE4(intel_sub_group_shuffle(as_uint(in[fy]), (fx * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
in0 = AS_INPUT_TYPE4(_sub_group_shuffle(as_uint(in[fy]), (fx * DILATION_SIZE_X + tile_x * STRIDE_SIZE_X) % SIMD));
# elif PRELOAD_INPUT
uint input_x_offset = (fx * DILATION_SIZE_X + oxi * STRIDE_SIZE_X);
uint input_y_offset = fy * INPUT_LINE_SIZE;
@@ -349,17 +339,14 @@ for (; y < tile_y_end; ++y) {
#endif
#if TILE_Y != 1
__attribute__((opencl_unroll_hint))
for (uint yi = 0; yi < FILTER_SIZE_Y - 1; ++yi) {
__attribute__((opencl_unroll_hint))
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
unroll_for (uint yi = 0; yi < FILTER_SIZE_Y - 1; ++yi) {
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
in[yi * INPUT_LINE_SIZE + xi] = in[(yi + 1) * INPUT_LINE_SIZE + xi];
}
}
{
uint yi = FILTER_SIZE_Y - 1;
__attribute__((opencl_unroll_hint))
for (uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
unroll_for(uint xi = 0; xi < INPUT_LINE_SIZE; ++xi) {
in[yi * INPUT_LINE_SIZE + xi] = input[input_offset + xi * (INPUT_X_PITCH / FSV)];
}
input_offset += DILATION_SIZE_Y * INPUT_Y_PITCH / FSV;
@@ -456,4 +443,3 @@ for (; y < tile_y_end; ++y) {
#undef WEIGHTS_YXS_PITCH
#undef FILTER_SPATIAL_SIZE
#undef CEIL_DIV

View File

@@ -3,12 +3,11 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#include "include/imad.cl"
#include "include/batch_headers/imad.cl"
#define INPUT0_PACKED_TYPE uint
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_b_fs_yx_fsv4_int8)(
const __global INPUT0_PACKED_TYPE* input,

View File

@@ -2,8 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
@@ -146,10 +147,8 @@
# error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - internal error, CHECK_BOUNDARY_IN_SLM enabled without PRELOAD_INPUT_TO_SLM.
#endif
#define CEIL_DIV(a, b) ( ((a) + (b) - 1) / (b) )
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(LWS0, LWS1, SIMD)))
KERNEL(convolution)(
const __global INPUT0_TYPE *input,
@@ -209,8 +208,7 @@ KERNEL(convolution)(
#if ASYMMETRIC_DATA_QUANTIZATION && CHECK_BOUNDARY_IN_SLM
uint4 azp_uniform[FSV / iteration_preload_bytes];
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < FSV / iteration_preload_bytes; ++i) {
unroll_for(uint i = 0; i < FSV / iteration_preload_bytes; ++i) {
azp_uniform[i] = ((const __global uint4*)(activations_zp + (f + i * iteration_preload_bytes)))[0];
}
#endif
@@ -285,8 +283,7 @@ KERNEL(convolution)(
if (early_return)
return;
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
unroll_for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
// Loop over 4 filter spatials that match imad case
uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
@@ -307,8 +304,7 @@ KERNEL(convolution)(
uint4 input_idx = input_spatial_offset + input_offset;
uint tx = 0;
__attribute__((opencl_unroll_hint))
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
INPUT_TYPE16 tmp_in0 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s0);
INPUT_TYPE16 tmp_in1 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s1);
INPUT_TYPE16 tmp_in2 = INPUT_BLOCK_READN(16, input_ptr, input_idx.s2);
@@ -374,13 +370,11 @@ KERNEL(convolution)(
uint4 input_y_offset = fy * dilation_size_y * input_y_pitch;
uint4 input_spatial_offset = input_x_offset + input_y_offset;
uint4 input_start_offset = input_spatial_offset + input_offset;
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
uint4 input_idx = input_start_offset + tx * STRIDE_SIZE_X * input_x_pitch;
// Block reads along feature slice
uint fw = 0;
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
INPUT_TYPE4 tmp_in0 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s0);
INPUT_TYPE4 tmp_in1 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s1);
INPUT_TYPE4 tmp_in2 = INPUT_BLOCK_READN(4, input_ptr, input_idx.s2);
@@ -417,14 +411,12 @@ KERNEL(convolution)(
#endif
// Weights loading:
FILTER_TYPE4 wei[F_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
}
#if CHECK_BOUNDARY && !CHECK_BOUNDARY_IN_SLM
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
int4 input_x = convert_int4(x * STRIDE_SIZE_X + tx * STRIDE_SIZE_X + fx * DILATION_SIZE_X) - PADDING_SIZE_X;
int4 input_y = convert_int4(y * STRIDE_SIZE_Y + fy * dilation_size_y) - PADDING_SIZE_Y;
int4 input_pad = input_x < 0 || input_x >= INPUT0_SIZE_X || input_y < 0 || input_y >= INPUT0_SIZE_Y;
@@ -433,20 +425,16 @@ KERNEL(convolution)(
#else
#define padding_value(fw) ((INPUT0_TYPE)0)
#endif
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans0[tx * F_PER_WI + fwp] = input_pad.s0 ? padding_value(fwp) : in_trans0[tx * F_PER_WI + fwp];
}
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans1[tx * F_PER_WI + fwp] = input_pad.s1 ? padding_value(fwp) : in_trans1[tx * F_PER_WI + fwp];
}
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans2[tx * F_PER_WI + fwp] = input_pad.s2 ? padding_value(fwp) : in_trans2[tx * F_PER_WI + fwp];
}
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans3[tx * F_PER_WI + fwp] = input_pad.s3 ? padding_value(fwp) : in_trans3[tx * F_PER_WI + fwp];
}
#undef padding_value
@@ -455,30 +443,24 @@ KERNEL(convolution)(
// Transpose input:
INPUT_TYPE4 in[TILE_X * F_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
uint in_offset = tx * F_PER_WI + fw;
in[in_offset] = (INPUT_TYPE4)(in_trans0[in_offset], in_trans1[in_offset], in_trans2[in_offset], in_trans3[in_offset]);
}
}
// IMAD:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
acc[tx * F_PER_WI + fw] = IMAD(acc[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], wei[fw]);
}
}
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
// Accumulate for input values for asymmetric weights:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
src_sum[tx * F_PER_WI + fw] = IMAD(src_sum[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], (char4)(1, 1, 1, 1));
}
}
@@ -492,13 +474,11 @@ KERNEL(convolution)(
// Leftovers in filters spatial - use raw multiplication instead of imad
// Load inputs before loop to avoid byte scattered reads + there are at most 3 leftovers
FILTER_TYPE4 wei[F_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
unroll_for (uint fw = 0; fw < F_PER_WI; ++fw) {
wei[fw] = AS_FILTER_TYPE4(_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
}
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
unroll_for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
// Input loading:
uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
@@ -511,8 +491,7 @@ KERNEL(convolution)(
uint input_idx = input_spatial_offset + input_offset;
uint tx = 0;
__attribute__((opencl_unroll_hint))
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
INPUT_TYPE16 tmp_in0 = INPUT_BLOCK_READN(16, input_ptr, input_idx);
VEC_TO_ARRAY_16(in_trans0, tmp_in0, tx);
input_idx += 16 * SIMD;
@@ -543,12 +522,10 @@ KERNEL(convolution)(
uint input_y_offset = fy * dilation_size_y * input_y_pitch;
uint input_spatial_offset = input_x_offset + input_y_offset;
uint input_start_offset = input_spatial_offset + input_offset;
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
uint input_idx = input_start_offset + tx * STRIDE_SIZE_X * input_x_pitch;
uint fw = 0;
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
INPUT_TYPE4 tmp_in0 = INPUT_BLOCK_READN(4, input_ptr, input_idx);
VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx * F_PER_WI + fw);
input_idx += 4 * SIMD;
@@ -566,8 +543,7 @@ KERNEL(convolution)(
# endif
#if CHECK_BOUNDARY && !CHECK_BOUNDARY_IN_SLM
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
int input_x = (x + tx) * STRIDE_SIZE_X + fx * DILATION_SIZE_X - PADDING_SIZE_X;
int input_y = y * STRIDE_SIZE_Y + fy * dilation_size_y - PADDING_SIZE_Y;
int input_pad = input_x < 0 || input_x >= INPUT0_SIZE_X || input_y < 0 || input_y >= INPUT0_SIZE_Y;
@@ -576,8 +552,7 @@ KERNEL(convolution)(
#else
#define padding_value(fw) ((INPUT0_TYPE)0)
#endif
__attribute__((opencl_unroll_hint))
for (uint fwp = 0; fwp < F_PER_WI; ++fwp) {
unroll_for(uint fwp = 0; fwp < F_PER_WI; ++fwp) {
in_trans0[tx * F_PER_WI + fwp] = input_pad ? padding_value(fwp) : in_trans0[tx * F_PER_WI + fwp];
}
#undef padding_value
@@ -585,20 +560,16 @@ KERNEL(convolution)(
#endif
// Raw multiply accumulate:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
acc[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw] * (int)wei[fw][fi];
}
}
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
// Accumulate input values for asymmetric weights:
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
src_sum[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw];
}
}
@@ -614,18 +585,14 @@ KERNEL(convolution)(
#if BIAS_TERM
# if BIAS_PER_OFM
MAKE_VECTOR_TYPE(BIAS_TYPE, F_PER_WI) bias_val = BLOCK_READN(BIAS_TYPE, F_PER_WI, biases, f);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(((BIAS_TYPE*)&bias_val)[fw]);
}
}
# elif BIAS_PER_OUTPUT
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
uint bias_offset = GET_BIAS_INDEX(b, f + fw * SIMD + get_sub_group_local_id(), y, x + tx);
BIAS_TYPE bias = biases[bias_offset];
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(bias);
@@ -639,10 +606,8 @@ KERNEL(convolution)(
#if ASYMMETRIC_WEIGHTS_QUANTIZATION
{
MAKE_VECTOR_TYPE(WEIGHTS_ZERO_POINTS_TYPE, F_PER_WI) wzp = BLOCK_READN(WEIGHTS_ZERO_POINTS_TYPE, F_PER_WI, weights_zp, f);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
dequantized[tx * F_PER_WI + fw] -= TO_DEQUANTIZED_TYPE(src_sum[tx * F_PER_WI + fw]) * TO_DEQUANTIZED_TYPE(((WEIGHTS_ZERO_POINTS_TYPE*)&wzp)[fw]);
}
}
@@ -652,10 +617,8 @@ KERNEL(convolution)(
#if COMPENSATION_TERM
{
MAKE_VECTOR_TYPE(COMPENSATION_TYPE, F_PER_WI) comp = BLOCK_READN(COMPENSATION_TYPE, F_PER_WI, compensation, f);
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(((COMPENSATION_TYPE*)&comp)[fw]);
}
}
@@ -664,14 +627,12 @@ KERNEL(convolution)(
OUTPUT_TYPE out[TILE_X * F_PER_WI];
// Fused ops and conversion to output type
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
#if HAS_FUSED_OPS
uint fused_ops_x = x + tx;
uint fused_ops_f = f;
uint fw = 0;
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
DEQUANTIZED_TYPE4 fused_ops_in;
ARRAY_TO_VEC_4(fused_ops_in, dequantized, tx * F_PER_WI + fw);
FUSED_OPS_4;
@@ -693,8 +654,7 @@ KERNEL(convolution)(
out[tx * F_PER_WI + fw] = FUSED_OPS_RESULT_1;
}
#else
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
out[tx * F_PER_WI + fw] = TO_OUTPUT_TYPE(dequantized[tx * F_PER_WI + fw]);
}
#endif
@@ -702,10 +662,8 @@ KERNEL(convolution)(
// Fill results outside output in features with OUTPUT_PAD_VALUE.
if (OUTPUT_FEATURE_NUM % FSV != 0 && f + FSV > OUTPUT_FEATURE_NUM) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint fw = 0; fw < F_PER_WI; ++fw) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint fw = 0; fw < F_PER_WI; ++fw) {
const uint sglid = get_sub_group_local_id();
// Hint here can save some movs if features are divisible by SIMD and not by FSV
ASSUME_HINT(sglid < SIMD);
@@ -721,8 +679,7 @@ KERNEL(convolution)(
// Full output tile x write using block write ladder
uint tx = 0;
#if OUTPUT_TYPE_SIZE * 16 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
unroll_for(; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
OUTPUT_TYPE16 tmp_write;
ARRAY_TO_VEC_16(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE16(output, output_offset, tmp_write);
@@ -730,8 +687,7 @@ KERNEL(convolution)(
}
#endif
#if OUTPUT_TYPE_SIZE * 8 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; tx + 8 <= TILE_X * F_PER_WI; tx += 8) {
unroll_for(; tx + 8 <= TILE_X * F_PER_WI; tx += 8) {
OUTPUT_TYPE8 tmp_write;
ARRAY_TO_VEC_8(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset, tmp_write);
@@ -739,16 +695,14 @@ KERNEL(convolution)(
}
#endif
#if OUTPUT_TYPE_SIZE * 4 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; tx + 4 <= TILE_X * F_PER_WI; tx += 4) {
unroll_for(; tx + 4 <= TILE_X * F_PER_WI; tx += 4) {
OUTPUT_TYPE4 tmp_write;
ARRAY_TO_VEC_4(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE4(output, output_offset, tmp_write);
output_offset += 4 * SIMD;
}
#endif
__attribute__((opencl_unroll_hint))
for (; tx + 2 <= TILE_X * F_PER_WI; tx += 2) {
unroll_for(; tx + 2 <= TILE_X * F_PER_WI; tx += 2) {
OUTPUT_TYPE2 tmp_write;
ARRAY_TO_VEC_2(tmp_write, out, tx);
DT_OUTPUT_BLOCK_WRITE2(output, output_offset, tmp_write);
@@ -759,20 +713,17 @@ KERNEL(convolution)(
}
} else {
// Leftovers write, block writes in f dimension only
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
if (tx < OUTPUT_SIZE_X % TILE_X) {
uint fw = 0;
#if OUTPUT_TYPE_SIZE * 4 <= MAX_OPT_BLOCK_WRITE_BYTES
__attribute__((opencl_unroll_hint))
for (; fw + 4 <= F_PER_WI; fw += 4) {
unroll_for(; fw + 4 <= F_PER_WI; fw += 4) {
OUTPUT_TYPE4 tmp_write;
ARRAY_TO_VEC_4(tmp_write, out, tx * F_PER_WI + fw);
DT_OUTPUT_BLOCK_WRITE4(output, output_offset + fw * SIMD, tmp_write);
}
#endif
__attribute__((opencl_unroll_hint))
for (; fw + 2 <= F_PER_WI; fw += 2) {
unroll_for(; fw + 2 <= F_PER_WI; fw += 2) {
OUTPUT_TYPE2 tmp_write;
ARRAY_TO_VEC_2(tmp_write, out, tx * F_PER_WI + fw);
DT_OUTPUT_BLOCK_WRITE2(output, output_offset + fw * SIMD, tmp_write);

View File

@@ -4,8 +4,9 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define TYPE_N_(type, n) type##n
#define TYPE_N(type, n) TYPE_N_(type, n)
@@ -45,15 +46,12 @@
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define SIMD 16
#define FSV 16
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
const __global INPUT0_TYPE *conv_input,
@@ -129,18 +127,15 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
uint4 weights_zp_val[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = as_uint4((FILTER_TYPE_16)weights_zp[out_f + ofb * FSV]);
}
#if FILTER_IFM_NUM % FSV != 0
uint4 weights_zp_vec_partial[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_vec_partial[ofb] = weights_zp_val[ofb];
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial[ofb];
__attribute__((opencl_unroll_hint))
for (uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
unroll_for(uint f = FILTER_IFM_NUM % FSV; f < FSV; f++) {
wzp_p[f] = 0;
}
}
@@ -152,8 +147,7 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#ifdef ASYMMETRIC_WEIGHTS_QUANTIZATION
#if FILTER_IFM_NUM % FSV != 0
if (in_f_start + (k + 1) * FSV >= ALIGN(FILTER_IFM_NUM, FSV)) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_zp_val[ofb] = weights_zp_vec_partial[ofb];
}
}
@@ -170,11 +164,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#ifdef SHOULD_USE_DATA_AND_WEIGHTS_ZP
ACCUMULATOR_TYPE_4 dotProdAZPxWZP[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
dotProdAZPxWZP[ofb] = 0;
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
unroll_for(uint ive = 0; ive < 4; ive++) {
dotProdAZPxWZP[ofb][ive] = TO_ACCUMULATOR_TYPE(
IMAD(dotProdAZPxWZP[ofb][ive],
AS_INPUT0_TYPE_4(data_zp_val[ive]),
@@ -188,12 +180,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
__attribute__((opencl_unroll_hint(1)))
for (uint fyn = 0; fyn < FILTER_SIZE_Y / FILTER_SIZE_Y_UNROLL; fyn++) {
// Load input block IN_BLOCK_DEPTH x IN_BLOCK_HEIGHT x IN_BLOCK_WIDTH, scattering width along sub-group
__attribute__((opencl_unroll_hint))
for (uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
__attribute__((opencl_unroll_hint))
for (uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
__attribute__((opencl_unroll_hint))
for (uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
unroll_for(uint izb = 0; izb < IN_BLOCK_DEPTH; ++izb) {
unroll_for(uint iyb = 0; iyb < IN_BLOCK_HEIGHT; ++iyb) {
unroll_for(uint ixb = 0; ixb < CEIL_DIV(IN_BLOCK_WIDTH, SIMD); ++ixb) {
uint input_idx = input_start_idx + izb * INPUT0_Z_PITCH * FSV + iyb * INPUT0_Y_PITCH * FSV + ixb * SIMD * FSV;
#ifdef SHOULD_USE_DATA_ZP
const int y_idx = input_y + fyn * DILATION_SIZE_Y + iyb;
@@ -300,23 +289,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
}
__attribute__((opencl_unroll_hint))
for (uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
__attribute__((opencl_unroll_hint))
for (uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
__attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
unroll_for(uint fzu = 0; fzu < FILTER_SIZE_Z_UNROLL; ++fzu) {
unroll_for(uint fyu = 0; fyu < FILTER_SIZE_Y_UNROLL; ++fyu) {
unroll_for (uint fx = 0; fx < FILTER_SIZE_X; fx++) {
uint4 weights_val[OFM_BLOCKS_PER_SIMD];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
weights_val[ofb] = vload4(0, (__global uint *)(weights + filter_idx + ofb * filter_idx_diff));
}
__attribute__((opencl_unroll_hint))
for (uint ive = 0; ive < 4; ive++) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for (uint ive = 0; ive < 4; ive++) {
unroll_for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
#ifdef SHOULD_USE_DATA_ZP
ACCUMULATOR_TYPE dotProdAZPxW = 0;
dotProdAZPxW = TO_ACCUMULATOR_TYPE(
@@ -325,19 +308,16 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
AS_FILTER_TYPE_4(weights_val[ofb][ive])));
#endif
__attribute__((opencl_unroll_hint(OUT_BLOCK_DEPTH)))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_HEIGHT)))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
unroll_for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
const uint z_block_idx = od * STRIDE_SIZE_Z + fzu * DILATION_SIZE_Z;
const uint y_block_idx = oh * STRIDE_SIZE_Y + fyu * DILATION_SIZE_Y;
const uint x_block_idx = ow * STRIDE_SIZE_X + fx * DILATION_SIZE_X;
const uint shuffle_wi = x_block_idx % SIMD;
const uint shuffle_idx = x_block_idx / SIMD;
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(_sub_group_shuffle(input_val[z_block_idx][y_block_idx][shuffle_idx][ive],
shuffle_wi));
dotProd[ofb][od][oh][ow] = TO_ACCUMULATOR_TYPE(
@@ -401,17 +381,12 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
get_sub_group_local_id();
if (get_sub_group_id() < OFM_BLOCKS_PER_SIMD) {
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
unroll_for(uint wg = 0; wg < OFM_BLOCKS_PER_SIMD; ++wg) {
if (get_sub_group_id() == wg) {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < wg; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint ofb = 0; ofb < wg; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -422,24 +397,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
}
}
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
dotProd[0][od][oh][ow] = dotProd[wg][od][oh][ow];
}
}
}
__attribute__((opencl_unroll_hint))
for (uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint ofb = wg + 1; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
((wg != 0) ? OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * OFM_SIZE_PER_SIMD : 0) +
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -454,14 +422,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
}
} else {
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint ofb = 0; ofb < OFM_BLOCKS_PER_SIMD; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
ofb * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -480,14 +444,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
return;
partial_acc_ptr = partial_acc + get_sub_group_id() * OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_DEPTH * SIMD + get_sub_group_local_id();
__attribute__((opencl_unroll_hint))
for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for (uint wg = 0; wg < FEATURE_SLM_SPLIT - 1; ++wg) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
const uint partial_acc_ptr_idx =
wg * OFM_SIZE_PER_SIMD * OUT_BLOCK_DEPTH * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH +
od * OUT_BLOCK_HEIGHT * OUT_BLOCK_WIDTH * SIMD +
@@ -510,29 +470,23 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#if BIAS_TERM
BIAS_TYPE bias[OFM_VALUES_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
bias[ofb] = biases[out_f + ofb * SIMD];
}
#endif
#ifdef COMPENSATION_TERM
COMPENSATION_TYPE comp[OFM_VALUES_PER_WI];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
comp[ofb] = compensation[out_f + ofb * SIMD];
}
#endif
ACTIVATION_TYPE dequantized[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
dequantized[ofb][od][oh][ow] = TO_ACTIVATION_TYPE(dotProd[ofb][od][oh][ow]);
#if BIAS_TERM
dequantized[ofb][od][oh][ow] += bias[ofb];
@@ -546,17 +500,13 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
OUTPUT_TYPE result[OFM_VALUES_PER_WI][OUT_BLOCK_DEPTH][OUT_BLOCK_HEIGHT][OUT_BLOCK_WIDTH];
__attribute__((opencl_unroll_hint))
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
unroll_for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ++ofb) {
#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD_SCALAR
FUSED_OPS_PRELOAD_SCALAR;
#endif
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
__attribute__((opencl_unroll_hint))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ++ow) {
ACTIVATION_TYPE dequantized_val = dequantized[ofb][od][oh][ow];
#if HAS_FUSED_OPS
# if FUSED_OPS_CAN_USE_PRELOAD_SCALAR
@@ -585,21 +535,17 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
for (uint ofb = 0; ofb < OFM_VALUES_PER_WI; ofb++) {
bool good_of_block = (CEIL_DIV(FILTER_OFM_NUM, SIMD) % OFM_BLOCKS_PER_SIMD == 0) || (out_f_sg + ofb * SIMD <= FILTER_OFM_NUM);
if (good_of_block) {
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
if (good_z) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
if (good_y) {
uint ow = 0;
#if OUTPUT_TYPE_SIZE == 1
__attribute__((opencl_unroll_hint))
for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
unroll_for (; ow + 8 <= OUT_BLOCK_WIDTH; ow += 8) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 8; ++i) {
unroll_for (uint i = 0; i < 8; ++i) {
result_val[i] = result[ofb][od][oh][ow + i];
}
DT_OUTPUT_BLOCK_WRITE8(output, dst_index, result_val);
@@ -607,11 +553,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
#endif
#if OUTPUT_TYPE_SIZE <= 2
__attribute__((opencl_unroll_hint))
for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
unroll_for (; ow + 4 <= OUT_BLOCK_WIDTH; ow += 4) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 4; ++i) {
unroll_for (uint i = 0; i < 4; ++i) {
result_val[i] = result[ofb][od][oh][ow + i];
}
DT_OUTPUT_BLOCK_WRITE4(output, dst_index, result_val);
@@ -619,11 +563,9 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
}
#endif
__attribute__((opencl_unroll_hint))
for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
unroll_for (; ow + 2 <= OUT_BLOCK_WIDTH; ow += 2) {
MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2) result_val;
__attribute__((opencl_unroll_hint))
for (uint i = 0; i < 2; ++i) {
unroll_for (uint i = 0; i < 2; ++i) {
result_val[i] = result[ofb][od][oh][ow + i];
}
DT_OUTPUT_BLOCK_WRITE2(output, dst_index, result_val);
@@ -655,12 +597,10 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#else
const uint dst_index = OUTPUT_GET_INDEX(out_b, out_f + ofb * SIMD, out_z, out_y, out_x);
#endif
__attribute__((opencl_unroll_hint))
for (uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
unroll_for(uint od = 0; od < OUT_BLOCK_DEPTH; ++od) {
bool good_z = (OUTPUT_SIZE_Z % OUT_BLOCK_DEPTH == 0) || (out_z + od < OUTPUT_SIZE_Z);
if (good_z) {
__attribute__((opencl_unroll_hint))
for (uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
unroll_for(uint oh = 0; oh < OUT_BLOCK_HEIGHT; ++oh) {
bool good_y = (OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT == 0) || (out_y + oh < OUTPUT_SIZE_Y);
if (good_y) {
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
@@ -720,9 +660,6 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
#undef AS_FILTER_TYPE_4
#undef CEIL_DIV
#undef ALIGN
#undef SIMD
#undef FSV
#undef OFM_VALUES_PER_WI

View File

@@ -2,13 +2,13 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/sub_group.cl"
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
{ \
const half16 acol0 = TRANSPOSE_BLOCK_16_FP16_HALF_TYPE( _blockA.s0 ); \
@@ -29,9 +29,6 @@
_result = fma( _blockB.s7, acol7, _result ); \
}
#else
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_16x8_8x16(_result, _blockA, _blockB) \
{ \
const float16 acol0 = TRANSPOSE_BLOCK_16( _blockA.s0 ); \
@@ -53,7 +50,11 @@
}
#endif
__attribute__((intel_reqd_sub_group_size(16)))
#ifndef ACCUMULATOR_TYPE
#define ACCUMULATOR_TYPE INPUT0_TYPE
#endif
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_bfyx_1x1)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -63,14 +64,15 @@ KERNEL(convolution_bfyx_1x1)(
#endif
uint split_idx)
{
const uint xy = (uint)get_group_id(0) * 16 + get_sub_group_local_id();
const uint group_xy = (uint)get_group_id(0) * 16;
const uint xy = group_xy + get_sub_group_local_id();
const uint x = xy % OUTPUT_SIZE_X;
const uint y = xy / OUTPUT_SIZE_X;
const uint f = (uint)get_group_id(1) * 16 + get_sub_group_local_id();//get_global_id(1);
const uint b = (uint)get_global_id(2);
const uint group_f = (uint)get_group_id(1) * 16;
MAKE_VECTOR_TYPE(UNIT_TYPE, 16) blockC00 = UNIT_VAL_ZERO;
MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 16) blockC00 = INPUT0_VAL_ZERO;
#if BIAS_TERM
#if BIAS_PER_OUTPUT
@@ -80,7 +82,7 @@ KERNEL(convolution_bfyx_1x1)(
#endif
for(uint i = 0; i < 16; i++)
{
blockC00[i] = intel_sub_group_shuffle(biases[bias_index], i);
blockC00[i] = _sub_group_shuffle(biases[bias_index], i);
}
#endif
@@ -92,18 +94,18 @@ KERNEL(convolution_bfyx_1x1)(
const uint filter_offset = group_f * ((FILTER_OFM_PITCH + 8 - 1) / 8) * 8;//f*FILTER_OFM_PITCH;
const uint xy_block_num = (INPUT0_FEATURE_PITCH + 16 - 1) / 16;
const uint f_block_num = (INPUT0_FEATURE_NUM + 8 - 1) / 8;
const uint input_offset = in_split_offset + xy * 8 + b * xy_block_num * f_block_num * 128;//b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
const uint input_offset = in_split_offset + group_xy * 8 + b * xy_block_num * f_block_num * 128;//b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
for (uint k = 0; k < (FILTER_IFM_NUM + 8 - 1) / 8; ++k)
{
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockA00;
MAKE_VECTOR_TYPE(UNIT_TYPE, 8) blockB00;
MAKE_VECTOR_TYPE(INPUT0_TYPE, 8) blockA00;
MAKE_VECTOR_TYPE(FILTER_TYPE, 8) blockB00;
uint input_idx = input_offset + k * 8 * xy_block_num * 16;
uint filter_idx = filter_offset + k * 8 * 16;
blockA00 = ALIGNED_BLOCK_READ8(input, input_idx);
blockB00 = ALIGNED_BLOCK_READ8(weights, filter_idx);
blockA00 = DT_INPUT_BLOCK_READ8(input, input_idx);
blockB00 = DT_FILTER_BLOCK_READ8(weights, filter_idx);
MULTIPLY_BLOCKS_16x8_8x16(blockC00, blockB00, blockA00);
}
@@ -128,3 +130,4 @@ KERNEL(convolution_bfyx_1x1)(
#undef CONCAT_TOKEN
#undef CONCAT_TOKEN_HANDLER1
#undef MULTIPLY_BLOCKS_16x16
#undef ACCUMULATOR_TYPE

View File

@@ -2,8 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/gemm_common.cl"
#define MULT(C_, A_, i_) \
@@ -13,7 +12,7 @@
DOT8i(C_, B24, A_, i_ + 3);
__attribute__((reqd_work_group_size(16, TY, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_bfyx_1x1_hgemm_buf_16x1)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -56,14 +55,14 @@ KERNEL(convolution_gpu_bfyx_1x1_hgemm_buf_16x1)(
// 512 MADs
half8 B0 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B0 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 B8 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B8 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 B16 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B16 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 B24 = as_half8(intel_sub_group_block_read_us8(weights, coordB));
half8 B24 = as_half8(_sub_group_block_read_us8(weights, coordB));
coordB.y += 8;
half8 A0 = A_load[K8*0 + k8];

View File

@@ -2,11 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define SIMD_SIZE 8
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
REQD_SUB_GROUP_SIZE(SIMD_SIZE)
KERNEL(convolution)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -58,18 +57,18 @@ KERNEL(convolution)(
}
#if OUT_BLOCK_DEPTH == 8
float8 w = as_float8(intel_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
float8 w = as_float8(_sub_group_block_read8((__global uint*)weights + filter_offset + k * 64));
#elif OUT_BLOCK_DEPTH == 4
float4 w = as_float4(intel_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
float4 w = as_float4(_sub_group_block_read4((__global uint*)weights + filter_offset + k * 32));
#elif OUT_BLOCK_DEPTH == 2
float2 w = as_float2(intel_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
float2 w = as_float2(_sub_group_block_read2((__global uint*)weights + filter_offset + k * 16));
#endif
for(uint br = 0; br < OUT_BLOCK_HEIGHT; br++)
{
for(uint bc = 0; bc < OUT_BLOCK_WIDTH; bc++)
{
float _in = intel_sub_group_shuffle(in[br], bc);
float _in = _sub_group_shuffle(in[br], bc);
for(uint bd = 0; bd < OUT_BLOCK_DEPTH/2; bd++)
{
dotProd0[bc + OUT_BLOCK_WIDTH * (br + OUT_BLOCK_HEIGHT * bd)] += _in * w[bd];

View File

@@ -2,18 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#if FP16_UNIT_USED
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_half(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (byte_offset), as_ushort8(val))
#else
#define ALIGNED_BLOCK_READ(ptr, byte_offset) as_float(intel_sub_group_block_read((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_WRITE(ptr, byte_offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (byte_offset), as_uint8(val))
#endif
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
__attribute__((reqd_work_group_size(16, 1, 1)))
KERNEL(convolution_depthwise_weights_lwg)(
__global INPUT0_TYPE* input,
@@ -41,7 +34,7 @@ KERNEL(convolution_depthwise_weights_lwg)(
const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_group_offset;
#if FILTER_SIZE_Y * FILTER_SIZE_X % 16 == 0 && !FP16_UNIT_USED
UNIT_TYPE w = ALIGNED_BLOCK_READ(weights, filter_offset);
UNIT_TYPE w = DT_FILTER_BLOCK_READ(weights, filter_offset);
#elif FILTER_SIZE_X * FILTER_SIZE_Y > 16 && FILTER_SIZE_X * FILTER_SIZE_Y <= 25
const uint lid = get_local_id(0);
UNIT_TYPE w[2] = { UNIT_VAL_ZERO };
@@ -78,9 +71,9 @@ KERNEL(convolution_depthwise_weights_lwg)(
#if FILTER_SIZE_X * FILTER_SIZE_Y > 16 && FILTER_SIZE_X * FILTER_SIZE_Y <= 25
const uint id = (j*FILTER_Y_PITCH + i*FILTER_X_PITCH) / 16;
const uint idx = (j*FILTER_Y_PITCH + i*FILTER_X_PITCH) % 16;
UNIT_TYPE w1 = intel_sub_group_shuffle(w[id], idx);
UNIT_TYPE w1 = _sub_group_shuffle(w[id], idx);
#else
UNIT_TYPE w1 = intel_sub_group_shuffle(w, j*FILTER_Y_PITCH + i*FILTER_X_PITCH);
UNIT_TYPE w1 = _sub_group_shuffle(w, j*FILTER_Y_PITCH + i*FILTER_X_PITCH);
#endif
dotProd = mad(input[input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH],
w1, dotProd);

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
//////////////////////////////////////////////////////////////////////////////
@@ -16,7 +15,7 @@
#define TILE_X 12 // Width of tile loaded in input (src0)
#define TILE_Y 10 // Height of tile loaded in input (src0)
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_f16_10x12x16)(
const __global half *src0,
__global half *dst,
@@ -100,12 +99,12 @@ KERNEL(convolution_f16_10x12x16)(
unsigned interleaved_y = 0;
LOOP(KERNEL_SLICE_DIV2, interleaved_y,
{
p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
p2BlockB[interleaved_y] = _sub_group_block_read_us2( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM_PER_GROUP * 2;
} )
if ( kernel_slice_is_odd )
{
pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
pBlockB[FILTER_SIZE_X * FILTER_SIZE_Y - 1] = _sub_group_block_read_us( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM_PER_GROUP * 2;
}

View File

@@ -1,159 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
//#include "include/cnn_common.cl"
//////////////////////////////////////////////////////////////////////////////
// Direct Convolution
#if defined(cl_intel_subgroups_short)
#define TILE_M DY // Height of tile in input patches (src0)
#define TILE_K DX // Width of tile in input patches (src0)
#define TILE_N 16 // Num filter channels per tile (src1)
#define TILE_X 8 // Width of tile loaded in input (src0)
#define TILE_Y 8 // Height of tile loaded in input (src0)
// Direct FP16 convolution: each sub-group computes a TILE_M x TILE_K patch of
// outputs for a group of 16 output channels (TILE_N == 16). Compiled only when
// the device exposes cl_intel_subgroups_short (see the enclosing #if), because
// the weight loads use 16-bit sub-group block reads.
//
// NOTE(review): ALIGNED_OFM, INPUT_*_PITCH, OUTPUT_*_PITCH, KERNEL_WIDTH/HEIGHT,
// KERNEL_SLICE_DIV2, STRIDE_X/Y, RIGHT_PARTIAL_TILE_K, LOOP, CAT,
// activation_function and ACTIVATION_PARAMS are build-time macros injected by
// the host compiler options -- they are not defined in this file.
__attribute__((intel_reqd_sub_group_size(16)))
__kernel void convolution_f16_8x8x16(
const __global half *src0,     // input feature maps
__global half *dst,            // output feature maps
const __global half *src1,     // filter weights, partially interleaved (see comment in the loop)
const __global half *biases)   // one bias value per output feature map
{
// NDRange decomposition: dimension 2 enumerates (output channel, batch) pairs.
const unsigned global_x = (uint)get_global_id(0);
const unsigned global_y = (uint)get_global_id(1);
const unsigned global_z = (uint)get_global_id(2);
const unsigned out_fm = global_z % ALIGNED_OFM;    // output feature map index
const unsigned batch_id = global_z / ALIGNED_OFM;  // batch index
const unsigned group_x = get_group_id(0);
const unsigned group_z = get_group_id(2);
const unsigned max_group_x = get_num_groups(0);
const unsigned local_z = get_local_id(2);
// Accumulators: one running sum per output element of the TILE_M x TILE_K patch.
half blockC[TILE_M * TILE_K] = { 0 };
// Offset of this work-item's input tile (batch + y + x), before the feature dim.
uint src0_offset_tile =
batch_id * INPUT_BATCH_PITCH // batch offset
+ ( global_y * TILE_M * STRIDE_Y ) * INPUT_Y_PITCH // y offset
+ ( global_x * TILE_K * STRIDE_X ); // x offset
// Per-lane offset within the tile: each lane fetches a half4 slice, with
// local_z selecting its (row, 4-wide column group) inside the TILE_X-wide tile.
uint src0_offset = src0_offset_tile
+ ( local_z / ( TILE_X / 4 ) ) * INPUT_Y_PITCH // y tile offset
+ ( local_z % ( TILE_X / 4 ) ) * 4; // x tile offset
const __global half *src1_read = src1 + ( group_z * TILE_N % ALIGNED_OFM ) * 2;
unsigned patch_depth = 0;
// Accumulate over all input channels (one iteration per input feature map).
__attribute__((opencl_unroll_hint(3)))
do
{
// Load atile (input) and btile (filters).
// Kernel data is partially interleaved. Every 2 rows are interleaved at float16 granularity.
// The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
// interleaved row is padded with zero to ensure same size as interleaved rows. This
// interleaving is done to increase consecutive data to fetch which reduces loads required.
// For example, this is how the kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
// (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
// (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
// (0, 2) (8, 2) (16, 2) (24, 2) ... ...
// ...
// in case the data is not aligned to sizeof(T)*KERNEL_WIDTH we need to use vload or set the data in a loop
half4 blockA = vload4(0, src0 + src0_offset );
src0_offset += INPUT_FEATURE_PITCH;  // advance to the next input channel
half blockB[KERNEL_WIDTH * KERNEL_HEIGHT];
ushort2* p2BlockB = (ushort2*)blockB;
ushort* pBlockB = (ushort* )blockB;
const bool kernel_slice_is_odd = ( KERNEL_WIDTH * KERNEL_HEIGHT ) % 2 == 1;
unsigned interleaved_y = 0;
// Each block read pulls two interleaved weight rows for this lane's channel.
LOOP(KERNEL_SLICE_DIV2, interleaved_y,
{
p2BlockB[interleaved_y] = intel_sub_group_block_read_us2( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM * 2;
} )
// Trailing non-interleaved (zero-padded) weight row when the slice count is odd.
if ( kernel_slice_is_odd )
{
pBlockB[KERNEL_WIDTH * KERNEL_HEIGHT - 1] = intel_sub_group_block_read_us( (const __global ushort*)src1_read );
src1_read += ALIGNED_OFM * 2;
}
// Fetch input element n of the tile from the lane that loaded it:
// element n%4 of the half4 held by lane n/4.
#define BLOCK_A(n) sub_group_broadcast( blockA[(n)%4], (n)/4 )
// Perform MADs
// Loop through all patches in tile (patch_x/y)
// For each patch, sum values (x/y)
unsigned patch_y=0;
LOOP(TILE_M, patch_y,
{
unsigned patch_x=0;
LOOP(TILE_K, patch_x,
{
unsigned tile_idx = patch_y * TILE_X * STRIDE_Y + patch_x * STRIDE_X;
unsigned out_idx = patch_y * TILE_K + patch_x;
unsigned y=0;
LOOP(KERNEL_HEIGHT, y,
{
unsigned x=0;
LOOP(KERNEL_WIDTH, x,
{
unsigned offset_idx = y * TILE_X + x;
unsigned out_chan_idx = y * KERNEL_WIDTH + x;
blockC[out_idx] = mad( BLOCK_A( tile_idx + offset_idx ), blockB[out_chan_idx], blockC[out_idx] );
} )
} )
} )
} )
}
while ( ++patch_depth < INPUT_FEATURE_NUM );
// Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
// TILE_K x TILE_M x SIMD. Partial writes most likely generated if output padding used.
// Group stores into vectors to expedite writeback. One large write is faster than many
// small saves. Right-most column may be smaller if output width not divisible by tile width.
__global half *out = dst
+ batch_id * OUTPUT_BATCH_PITCH // batch offset
+ out_fm * OUTPUT_FEATURE_PITCH // channel offset
+ ( global_y * TILE_M ) * OUTPUT_Y_PITCH // y offset
+ ( global_x * TILE_K ); // x offset
// Skip lanes that map to padding channels / batches of the aligned NDRange.
if ( batch_id < OUTPUT_BATCH_NUM && out_fm < OUTPUT_FEATURE_NUM )
{
half bias = biases[out_fm];
// Full-width tiles: one TILE_K-wide vector store per output row.
if ( OUTPUT_SIZE_X % TILE_K == 0 ||
group_x < max_group_x - 1 )
{
typedef CAT( half, TILE_K ) half_t;
half bias = biases[out_fm];  // NOTE(review): shadows the identical outer 'bias'; redundant but harmless
for( unsigned y = 0; y < TILE_M; y++ )
{
if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
{
half_t vBlockC;
half *pvBlockC = (half*)&vBlockC;
for (unsigned i = 0; i < TILE_K; i++) pvBlockC[i] = activation_function(blockC[y * TILE_K + i] + bias, ACTIVATION_PARAMS);
*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
}
}
}
else
{
// Right-most partial tile: store only RIGHT_PARTIAL_TILE_K elements per row.
typedef CAT( half, RIGHT_PARTIAL_TILE_K ) half_t;
for( unsigned y = 0; y < TILE_M; y++ )
{
if ( global_y * TILE_M + y < OUTPUT_SIZE_Y )
{
half_t vBlockC;
half *pvBlockC = (half*)&vBlockC;
for (unsigned i = 0; i < RIGHT_PARTIAL_TILE_K; i++) pvBlockC[i] = activation_function(blockC[y * TILE_K + i] + bias, ACTIVATION_PARAMS);
*(__global half_t*)(out + y * OUTPUT_Y_PITCH) = vBlockC;
}
}
}
}
}
#endif // cl_intel_subgroups_short

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define INPUT_TYPE INPUT0_TYPE
@@ -19,59 +21,19 @@
#define AS_FILTER_TYPE8 CAT(as_, FILTER_TYPE8)
#if INPUT0_TYPE_SIZE == 2
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ2(ptr, offset) AS_INPUT_TYPE2(intel_sub_group_block_read_us2((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read_us4((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
#elif INPUT0_TYPE_SIZE == 4
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ2(ptr, offset) AS_INPUT_TYPE2(intel_sub_group_block_read2((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ4(ptr, offset) AS_INPUT_TYPE4(intel_sub_group_block_read4((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16.cl: unsupported input type
#endif
#if FILTER_TYPE_SIZE == 2
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
#elif FILTER_TYPE_SIZE == 4
# define FILTER_BLOCK_READ8(ptr, offset) AS_FILTER_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16.cl: unsupported filter type
#endif
#if OUTPUT_FORMAT_BFYX
# define OUTPUTVTYPE(n) CAT(OUTPUT_TYPE, n)
# define TO_OUTPUTVTYPE CAT(convert_, OUTPUTVTYPE(OUTPUT_X_BLOCK_SIZE))
# define VSTORE CAT(vstore, OUTPUT_X_BLOCK_SIZE)
#else
# if OUTPUT_TYPE_SIZE == 1
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) BLOCK_WRITE_UC_2((__global uchar*)(ptr) + (offset), as_uchar2(val))
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr) + (offset), as_uchar4(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
# elif OUTPUT_TYPE_SIZE == 2
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write_us2((__global ushort*)(ptr) + (offset), as_ushort2(val))
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write_us4((__global ushort*)(ptr) + (offset), as_ushort4(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
# elif OUTPUT_TYPE_SIZE == 4
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
# define OUTPUT_BLOCK_WRITE2(ptr, offset, val) intel_sub_group_block_write2((__global uint*)(ptr) + (offset), as_uint2(val))
# define OUTPUT_BLOCK_WRITE4(ptr, offset, val) intel_sub_group_block_write4((__global uint*)(ptr) + (offset), as_uint4(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
# else
# error convolution_gpu_bfyx_f16.cl: unsupported output type
# endif
#endif // OUTPUT_FORMAT_BFYX
#if INPUT0_TYPE_SIZE == 2
# define AS_INPUT_SRC CAT(as_, MAKE_VECTOR_TYPE(INPUT_TYPE, OUTPUT_X_BLOCK_SIZE))
# define AS_US_SRC CAT(as_, MAKE_VECTOR_TYPE(ushort, OUTPUT_X_BLOCK_SIZE))
# define GET_SRC(data, id) AS_INPUT_SRC(intel_sub_group_shuffle(AS_US_SRC(data), id))
# define GET_SRC(data, id) AS_INPUT_SRC(_sub_group_shuffle(AS_US_SRC(data), id))
#else
# define GET_SRC(data, id) intel_sub_group_shuffle(data, id)
# define GET_SRC(data, id) _sub_group_shuffle(data, id)
#endif
#define FEATURE_SLICE_SIZE 16
@@ -79,7 +41,7 @@
#define FILTER_OFM_NUM_ALIGNED (((FILTER_OFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
#define FILTER_IFM_NUM_ALIGNED (((FILTER_IFM_NUM + FEATURE_SLICE_SIZE - 1) / FEATURE_SLICE_SIZE) * FEATURE_SLICE_SIZE)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_bfyx_f16)(
__global INPUT0_TYPE* input,
@@ -169,12 +131,12 @@ KERNEL(convolution_bfyx_f16)(
#if BIAS_TERM
#if SLM_DIV_FACTOR == 1
vec_t dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
vec_t dst = (vec_t)(DT_INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
#else
vec_t dst;
if (feature_sub_block == 0) {
dst = (vec_t)(INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
dst = (vec_t)(DT_INPUT_BLOCK_READ(biases, feature_block * FEATURE_SLICE_SIZE));
} else {
dst = INPUT0_VAL_ZERO;
}
@@ -240,7 +202,7 @@ KERNEL(convolution_bfyx_f16)(
{
int xb = 0;
for (; xb + 8 <= INPUT_LINE_SIZE; xb += 8) {
INPUT_TYPE8 vv = INPUT_BLOCK_READ8(input, grouped_input_offset +
INPUT_TYPE8 vv = DT_INPUT_BLOCK_READ8(input, grouped_input_offset +
icb * input_fs_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
xb * input_x_pitch);
@@ -255,7 +217,7 @@ KERNEL(convolution_bfyx_f16)(
line_cache[xb + 7] = vv[7];
}
for (; xb + 4 <= INPUT_LINE_SIZE; xb += 4) {
INPUT_TYPE4 vv = INPUT_BLOCK_READ4(input, grouped_input_offset +
INPUT_TYPE4 vv = DT_INPUT_BLOCK_READ4(input, grouped_input_offset +
icb * input_fs_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
xb * input_x_pitch);
@@ -266,7 +228,7 @@ KERNEL(convolution_bfyx_f16)(
line_cache[xb + 3] = vv[3];
}
for (; xb < INPUT_LINE_SIZE; xb++) {
line_cache[xb] = INPUT_BLOCK_READ(input, grouped_input_offset +
line_cache[xb] = DT_INPUT_BLOCK_READ(input, grouped_input_offset +
icb * input_fs_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
xb * input_x_pitch);
@@ -333,11 +295,11 @@ KERNEL(convolution_bfyx_f16)(
# error convolution_gpu_bfyx_f16.cl: unsupported input feature size for multiple groups input preload
#endif // FILTER_IFM_NUM
#else
FILTER_TYPE8 wei0 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
FILTER_TYPE8 wei0 = DT_FILTER_BLOCK_READ8(weights, grouped_filter_offset +
icb * filter_is_pitch +
kh * filter_y_pitch +
kw * filter_x_pitch);
FILTER_TYPE8 wei1 = FILTER_BLOCK_READ8(weights, grouped_filter_offset +
FILTER_TYPE8 wei1 = DT_FILTER_BLOCK_READ8(weights, grouped_filter_offset +
icb * filter_is_pitch +
kh * filter_y_pitch +
kw * filter_x_pitch +
@@ -388,8 +350,7 @@ KERNEL(convolution_bfyx_f16)(
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_sub_block == 0) {
__attribute__((opencl_unroll_hint))
for (int i = 1; i < SLM_DIV_FACTOR; i++)
unroll_for(int i = 1; i < SLM_DIV_FACTOR; i++)
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1
@@ -453,13 +414,13 @@ KERNEL(convolution_bfyx_f16)(
#endif
#else
#if OUTPUT_X_BLOCK_SIZE == 8
OUTPUT_BLOCK_WRITE8(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset, res);
#elif OUTPUT_X_BLOCK_SIZE == 4
OUTPUT_BLOCK_WRITE4(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE4(output, output_offset, res);
#elif OUTPUT_X_BLOCK_SIZE == 2
OUTPUT_BLOCK_WRITE2(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE2(output, output_offset, res);
#elif OUTPUT_X_BLOCK_SIZE == 1
OUTPUT_BLOCK_WRITE(output, output_offset, res);
DT_OUTPUT_BLOCK_WRITE(output, output_offset, res);
#else
# error convolution_gpu_bfyx_f16.cl: unsupported output x block size
#endif
@@ -480,7 +441,7 @@ KERNEL(convolution_bfyx_f16)(
#if OUTPUT_FORMAT_BFYX
output[output_offset + i] = res[i];
#else
OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
DT_OUTPUT_BLOCK_WRITE(output, output_offset + i * output_x_pitch, res[i]);
#endif
}
}
@@ -511,20 +472,8 @@ KERNEL(convolution_bfyx_f16)(
#undef AS_FILTER_TYPE8
#undef INPUT_BLOCK_READ
#undef INPUT_BLOCK_READ2
#undef INPUT_BLOCK_READ4
#undef INPUT_BLOCK_READ8
#undef FILTER_BLOCK_READ8
#if OUTPUT_FORMAT_BFYX
# undef OUTPUTVTYPE
# undef TO_OUTPUTVTYPE
# undef VSTORE
#else
# undef OUTPUT_BLOCK_WRITE
# undef OUTPUT_BLOCK_WRITE2
# undef OUTPUT_BLOCK_WRITE4
# undef OUTPUT_BLOCK_WRITE8
#endif // OUTPUT_FORMAT_BFYX

View File

@@ -2,17 +2,18 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/common.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
#if X_BLOCK_SIZE > 1
# define GET_SRC(data, id) AS_TYPE(MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE), \
intel_sub_group_shuffle( \
_sub_group_shuffle( \
AS_TYPE(MAKE_VECTOR_TYPE(UNIT_BLOCK_RW_TYPE, X_BLOCK_SIZE), data), \
id))
#else
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
# define GET_SRC(data, id) AS_TYPE(UNIT_TYPE, _sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, data), id))
#endif
#define FEATURE_SLICE_SIZE 16
@@ -22,7 +23,7 @@
# define UNIT_BLOCK_WRITE_VEC(ptr, offset, val) CAT(UNIT_BLOCK_WRITE, X_BLOCK_SIZE)(ptr, offset, val)
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
__global INPUT0_TYPE* input,
@@ -211,8 +212,7 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_sub_block == 0) {
__attribute__((opencl_unroll_hint))
for (int i = 1; i < SLM_DIV_FACTOR; i++)
unroll_for(int i = 1; i < SLM_DIV_FACTOR; i++)
dst += partial_summ[lid1 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1

View File

@@ -3,9 +3,9 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define FEATURE_SLICE_SIZE 16
@@ -22,42 +22,9 @@
#define AS_FILTER_TYPE2 CAT(as_, FILTER_TYPE2)
#define TO_OUTPUT_TYPE8 CAT(convert_, OUTPUT_TYPE8)
#if INPUT0_TYPE_SIZE == 2
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read_us8((__global ushort*)(ptr) + (offset)))
#elif INPUT0_TYPE_SIZE == 4
# define INPUT_BLOCK_READ(ptr, offset) AS_INPUT_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
# define INPUT_BLOCK_READ8(ptr, offset) AS_INPUT_TYPE8(intel_sub_group_block_read8((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported input type.
#endif
#if FILTER_TYPE_SIZE == 2
# define FILTER_BLOCK_READ(ptr, offset) AS_FILTER_TYPE(intel_sub_group_block_read_us((__global ushort*)(ptr) + (offset)))
# define FILTER_BLOCK_READ2(ptr, offset) AS_FILTER_TYPE2(intel_sub_group_block_read_us2((__global ushort*)(ptr) + (offset)))
#elif FILTER_TYPE_SIZE == 4
# define FILTER_BLOCK_READ(ptr, offset) AS_FILTER_TYPE(intel_sub_group_block_read((__global uint*)(ptr) + (offset)))
# define FILTER_BLOCK_READ2(ptr, offset) AS_FILTER_TYPE2(intel_sub_group_block_read2((__global uint*)(ptr) + (offset)))
#else
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported filter type.
#endif
#if OUTPUT_TYPE_SIZE == 1
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITE_UC_1((__global uchar*)(ptr) + (offset), as_uchar(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr) + (offset), as_uchar8(val))
#elif OUTPUT_TYPE_SIZE == 2
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write_us((__global ushort*)(ptr) + (offset), as_ushort(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write_us8((__global ushort*)(ptr) + (offset), as_ushort8(val))
#elif OUTPUT_TYPE_SIZE == 4
# define OUTPUT_BLOCK_WRITE(ptr, offset, val) intel_sub_group_block_write((__global uint*)(ptr) + (offset), as_uint(val))
# define OUTPUT_BLOCK_WRITE8(ptr, offset, val) intel_sub_group_block_write8((__global uint*)(ptr) + (offset), as_uint8(val))
#else
# error convolution_gpu_bfyx_f16_depthwise.cl - unsupported output type.
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_depthwise)(
KERNEL(convolution_gpu_bfyx_f16_depthwise)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
@@ -96,32 +63,32 @@ KERNEL(convolution_depthwise)(
(f_block + input_fs_pad_before) * input_fs_pitch;
#if BIAS_TERM
INPUT_TYPE8 dst = (INPUT_TYPE8)(INPUT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
INPUT_TYPE8 dst = (INPUT_TYPE8)(DT_INPUT_BLOCK_READ(biases, f_block * FEATURE_SLICE_SIZE));
#else
INPUT_TYPE8 dst = (INPUT_TYPE8)(INPUT0_VAL_ZERO);
#endif
#if ((FILTER_SIZE_X == 3) && (FILTER_SIZE_Y == 3) && (STRIDE_SIZE_X == 1) && (DILATION_SIZE_X == 1) && (DILATION_SIZE_Y == 1))
FILTER_TYPE wei_00 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_01 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_02 = FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_10 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_11 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_12 = FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_20 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_21 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_22 = FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_00 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_01 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_02 = DT_FILTER_BLOCK_READ(weights, filter_offset + 0 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_10 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_11 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_12 = DT_FILTER_BLOCK_READ(weights, filter_offset + 1 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_20 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 0 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_21 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 1 * FEATURE_SLICE_SIZE);
FILTER_TYPE wei_22 = DT_FILTER_BLOCK_READ(weights, filter_offset + 2 * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + 2 * FEATURE_SLICE_SIZE);
INPUT_TYPE8 src_block_0 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 0) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_1 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 1) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_2 = INPUT_BLOCK_READ8(input, input_offset + (input_y + 2) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE src_tail_00 = INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_01 = INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_10 = INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_11 = INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_20 = INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_21 = INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE8 src_block_0 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 0) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_1 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 1) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE8 src_block_2 = DT_INPUT_BLOCK_READ8(input, input_offset + (input_y + 2) * input_y_pitch + (input_x) * input_x_pitch);
INPUT_TYPE src_tail_00 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_01 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 0) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_10 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_11 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 1) * input_y_pitch + (input_x + 9) * input_x_pitch);
INPUT_TYPE src_tail_20 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 8) * input_x_pitch);
INPUT_TYPE src_tail_21 = DT_INPUT_BLOCK_READ(input, input_offset + (input_y + 2) * input_y_pitch + (input_x + 9) * input_x_pitch);
#if X_BLOCK_SIZE == 8
for (uint i = 0; i < X_BLOCK_SIZE - 2; i++)
@@ -185,12 +152,12 @@ KERNEL(convolution_depthwise)(
unroll_for (uint i = 0; i < FILTER_SIZE_Y; i++) {
unroll_for (uint j = 0; j < FILTER_SIZE_X_DIV_2; j++) {
wei_temp = FILTER_BLOCK_READ2(weights, filter_offset + i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + j * 2 * FEATURE_SLICE_SIZE);
wei_temp = DT_FILTER_BLOCK_READ2(weights, filter_offset + i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE + j * 2 * FEATURE_SLICE_SIZE);
wei[i * FILTER_SIZE_X + j * 2] = wei_temp.s0;
wei[i * FILTER_SIZE_X + j * 2 + 1] = wei_temp.s1;
}
#if (FILTER_SIZE_X % 2)
wei[i * FILTER_SIZE_X + FILTER_SIZE_X - 1] = FILTER_BLOCK_READ(weights, filter_offset +
wei[i * FILTER_SIZE_X + FILTER_SIZE_X - 1] = DT_FILTER_BLOCK_READ(weights, filter_offset +
i * FILTER_Y_PITCH * FEATURE_SLICE_SIZE +
(FILTER_SIZE_X - 1) * FEATURE_SLICE_SIZE);
#endif // (FILTER_SIZE_X % 2)
@@ -201,7 +168,7 @@ KERNEL(convolution_depthwise)(
unroll_for (uint k = 0; k < X_BLOCK_SIZE; k++) {
unroll_for (uint i = 0; i < FILTER_SIZE_Y; i++) {
unroll_for (uint j = 0; j < FILTER_SIZE_X; j++) {
src[k * FILTER_SIZE_Y * FILTER_SIZE_X + i * FILTER_SIZE_X + j] = INPUT_BLOCK_READ(input, input_offset +
src[k * FILTER_SIZE_Y * FILTER_SIZE_X + i * FILTER_SIZE_X + j] = DT_INPUT_BLOCK_READ(input, input_offset +
(input_y + (i * DILATION_SIZE_Y)) * input_y_pitch +
(input_x + (j * DILATION_SIZE_X) + k * STRIDE_SIZE_X) * input_x_pitch);
}
@@ -260,7 +227,7 @@ KERNEL(convolution_depthwise)(
#else
res = TO_OUTPUT_TYPE8(dst);
#endif // HAS_FUSED_OPS
OUTPUT_BLOCK_WRITE8(output, output_offset + x * output_x_pitch, res);
DT_OUTPUT_BLOCK_WRITE8(output, output_offset + x * output_x_pitch, res);
}
else
{
@@ -271,7 +238,7 @@ KERNEL(convolution_depthwise)(
#else
res[i] = TO_OUTPUT_TYPE(dst[i]);
#endif // HAS_FUSED_OPS
OUTPUT_BLOCK_WRITE(output, output_offset + (x + i) * output_x_pitch, res[i]);
DT_OUTPUT_BLOCK_WRITE(output, output_offset + (x + i) * output_x_pitch, res[i]);
}
}
}
@@ -300,13 +267,11 @@ KERNEL(convolution_depthwise)(
#else
res = TO_OUTPUT_TYPE(dst[0]);
#endif // HAS_FUSED_OPS
OUTPUT_BLOCK_WRITE(output, output_offset + x * output_x_pitch, res);
DT_OUTPUT_BLOCK_WRITE(output, output_offset + x * output_x_pitch, res);
}
#endif
}
#undef unroll_for
#undef FEATURE_SLICE_SIZE
#undef X_BLOCK_SIZE
@@ -322,12 +287,3 @@ KERNEL(convolution_depthwise)(
#undef AS_FILTER_TYPE2
#undef TO_OUTPUT_TYPE8
#undef INPUT_BLOCK_READ
#undef INPUT_BLOCK_READ8
#undef FILTER_BLOCK_READ
#undef FILTER_BLOCK_READ2
#undef OUTPUT_BLOCK_WRITE
#undef OUTPUT_BLOCK_WRITE8

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#if defined(cl_intel_subgroups_short)
@@ -10,7 +9,7 @@
#define TILE_K FILTER_SIZE_X
#define TILE_N 32
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_f16)(
const __global half *src0,
__global half *dst,
@@ -207,12 +206,12 @@ KERNEL(convolution_f16)(
interleaved_y = 0;
LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
{
p4BlockB00[interleaved_y] = intel_sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
p4BlockB00[interleaved_y] = _sub_group_block_read_us4( (const __global ushort*)src1 + src1_read_offset );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
} )
if ( kernel_width_is_odd )
{
p2BlockB00[FILTER_SIZE_X - 1] = intel_sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
p2BlockB00[FILTER_SIZE_X - 1] = _sub_group_block_read_us2( (const __global ushort*)src1 + src1_read_offset );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
}

View File

@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -10,7 +10,7 @@
#define TILE_K FILTER_SIZE_X
#define TILE_N 32
__attribute__((intel_reqd_sub_group_size(8)))
REQD_SUB_GROUP_SIZE(8)
KERNEL(convolution_f32)(
const __global float *src0,
__global float *dst,
@@ -149,12 +149,12 @@ KERNEL(convolution_f32)(
interleaved_y = 0;
LOOP(FILTER_SIZE_X_DIV2, interleaved_y,
{
p8BlockB00[interleaved_y] = as_float8( intel_sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
p8BlockB00[interleaved_y] = as_float8( _sub_group_block_read8( (const __global uint*)src1 + src1_read_offset ) );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
} )
if ( kernel_width_is_odd )
{
p4BlockB00[FILTER_SIZE_X - 1] = as_float4( intel_sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
p4BlockB00[FILTER_SIZE_X - 1] = as_float4( _sub_group_block_read4( (const __global uint*)src1 + src1_read_offset ) );
src1_read_offset += ALIGNED_OFM_PER_GROUP * 2;
}

View File

@@ -2,10 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_iyxo_5x5)(
const __global UNIT_TYPE* input,

View File

@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
@@ -53,7 +53,7 @@ if (_kernel_data.leftovers)
#define FEATURES_THREADS_PER_BATCH (FILTER_OFM_NUM)
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
const __global UNIT_TYPE* input,
@@ -173,10 +173,10 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
#if IN_BLOCK_WIDTH != SUB_GROUP_SIZE
//if we fix the programming model, then we could use a nice simple 2d array: val = in[br * STRIDE_SIZE_Y + kr][bc * STRIDE_SIZE_X + kc];
UNIT_TYPE val = intel_sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
UNIT_TYPE val = _sub_group_shuffle( in[(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) / SUB_GROUP_SIZE],
(((br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y) * IN_BLOCK_WIDTH) + (bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X)) % SUB_GROUP_SIZE);
#else
UNIT_TYPE val = intel_sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
UNIT_TYPE val = _sub_group_shuffle( in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
#endif
out[br * OUTPUT_BLOCK_WIDTH + bc] = mad(w[wi % PREFETCH], val, out[br * OUTPUT_BLOCK_WIDTH + bc]);

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define FEATURE_SLICE_SIZE 16
@@ -17,9 +19,9 @@
#error "convolution_gpu_bfyx_to_bfyx_f16: Filter and bias has different data type."
#endif
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_bfyx_to_bfyx_f16)(
KERNEL(convolution_gpu_bfyx_to_bfyx_f16)(
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights,
@@ -134,7 +136,7 @@ KERNEL(convolution_bfyx_to_bfyx_f16)(
INPUT0_TYPE src[INPUT0_FEATURE_NUM];
__attribute__((opencl_unroll_hint(INPUT0_FEATURE_NUM)))
for (int ic = 0; ic < INPUT0_FEATURE_NUM; ic++) {
src[ic] = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
src[ic] = _sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
dst[i] = mad(wei[ic], src[ic], dst[i]);
}
}

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
@@ -10,7 +12,7 @@
#define FEATURE_SLICE_SIZE 16
#define INPUT_FEATURE_NUM 3
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1)))
KERNEL(convolution_gpu_bfyx_to_bs_fs_yx_bsv16_fsv16)(
__global INPUT0_TYPE* input,
@@ -142,7 +144,7 @@ KERNEL(convolution_gpu_bfyx_to_bs_fs_yx_bsv16_fsv16)(
__attribute__((opencl_unroll_hint(INPUT_FEATURE_NUM)))
for (int ic = 0; ic < INPUT_FEATURE_NUM; ic++) {
UNIT_TYPE src = intel_sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
UNIT_TYPE src = _sub_group_shuffle(line_cache[ic * INPUT_BLOCK_SIZE + buf_offset], buf_group);
dst[i] = mad(w[ic], src, dst[i]);
}
}

View File

@@ -2,12 +2,12 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/unit_type.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
@@ -58,7 +58,7 @@
#define ALIGNED_IFM_NUM (((FILTER_IFM_NUM + FSV - 1) / FSV) * FSV)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
__global UNIT_TYPE* input,
@@ -164,7 +164,7 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
// With simd along x dimension:
// (out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) / SUB_GROUP_SIZE - element number in simd-lane;
// (out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) % SUB_GROUP_SIZE - simd-lane with that element.
UNIT_TYPE in_val = intel_sub_group_shuffle(
UNIT_TYPE in_val = _sub_group_shuffle(
in[(out_y * STRIDE_SIZE_Y + f_y * DILATION_SIZE_Y) * INPUT_BLOCK_WIDTH_EL_CNT +
(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) / SUB_GROUP_SIZE],
(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) % SUB_GROUP_SIZE);
@@ -299,8 +299,6 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING

View File

@@ -2,11 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
@@ -41,7 +41,7 @@
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_fs_byx_fsv32)(
__global INPUT0_TYPE* input,
@@ -128,7 +128,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
{
unroll_for (uint out_f = 0; out_f < FSV_PER_THREAD; ++out_f)
{
INPUT0_TYPE in_val = intel_sub_group_shuffle(
INPUT0_TYPE in_val = _sub_group_shuffle(
in[(out_x * STRIDE_SIZE_X + f_x * DILATION_SIZE_X) * FSV_PER_THREAD + ifii / SUB_GROUP_SIZE],
ifii % SUB_GROUP_SIZE);
@@ -242,8 +242,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
#undef INPUT0_SIZE_B_WITH_PADDING

View File

@@ -3,11 +3,11 @@
//
#include "include/unit_type.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
@@ -33,7 +33,7 @@
// OUTPUT_BLOCK_HEIGHT - [int] number of elements calculated in y dimension by one thread
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
__global UNIT_TYPE* input,
@@ -109,7 +109,7 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
{
unroll_for (uint out_f = 0; out_f < FSV_PER_THREAD; ++out_f)
{
UNIT_TYPE in_val = intel_sub_group_shuffle(input_read[in_f / SUB_GROUP_SIZE], in_f % SUB_GROUP_SIZE);
UNIT_TYPE in_val = _sub_group_shuffle(input_read[in_f / SUB_GROUP_SIZE], in_f % SUB_GROUP_SIZE);
const uint out_idx = out_y * OUTPUT_BLOCK_WIDTH * FSV_PER_THREAD + out_x * FSV_PER_THREAD + out_f;
out[out_idx] = mad(w[in_f * FSV_PER_THREAD + out_f], in_val, out[out_idx]);
@@ -236,8 +236,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32_1x1)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
#undef INPUT0_SIZE_B_WITH_PADDING

View File

@@ -2,12 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/unit_type.cl"
#include "include/batch_headers/fetch_data.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
#define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
#define INPUT0_SIZE_B_WITH_PADDING (INPUT0_PAD_BEFORE_BATCH_NUM + INPUT0_BATCH_NUM + INPUT0_PAD_AFTER_BATCH_NUM)
@@ -32,9 +31,9 @@
// ======================================================================================
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
KERNEL(convolution_gpu_fs_byx_fsv32)(
KERNEL(convolution_gpu_fs_byx_fsv32_depthwise)(
__global UNIT_TYPE* input,
__global UNIT_TYPE* output,
__global UNIT_TYPE* weights,
@@ -226,8 +225,6 @@ KERNEL(convolution_gpu_fs_byx_fsv32)(
// ========================================================================
}
#undef unroll_for
#undef INPUT0_SIZE_X_WITH_PADDING
#undef INPUT0_SIZE_Y_WITH_PADDING
#undef INPUT0_SIZE_B_WITH_PADDING

View File

@@ -3,7 +3,8 @@
//
#include "include/batch_headers/fetch_data.cl"
#include "include/imad.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/imad.cl"
#if QUANTIZATION_TERM
# define ACCUMULATOR_TYPE int
# define TO_ACCUMULATOR_TYPE(x) convert_int(x)
@@ -40,9 +41,6 @@
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define ALIGN(a, b) ((a % b == 0) ? a : a - a % b + b)
#if INPUT0_PAD_BEFORE_SIZE_X != 0 || INPUT0_PAD_BEFORE_SIZE_Y != 0
#define NON_ZERO_INPUT0_PAD_BEFORE
#endif
@@ -68,7 +66,7 @@
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
REQD_SUB_GROUP_SIZE(SIMD_SIZE)
__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
KERNEL (fused_convolution_eltwise_gpu_imad)(
#if INPUT0_LAYOUT_B_FS_YX_FSV16
@@ -134,8 +132,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
int weights_zp_vec_partial;
weights_zp_vec_partial = weights_zp_val;
FILTER_TYPE* wzp_p = (FILTER_TYPE*)&weights_zp_vec_partial;
__attribute__((opencl_unroll_hint))
for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
unroll_for (uint in_f = FILTER_IFM_NUM % PACK; in_f < PACK; in_f++) {
wzp_p[in_f] = 0;
}
#endif
@@ -237,7 +234,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
#endif
#else
#ifdef BLOCK_LOAD_INPUTS
in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read((const __global uint*) &conv_input[in_addr]));
in[reg] = AS_PACKED_TYPE(_sub_group_block_read((const __global uint*) &conv_input[in_addr]));
#ifdef SHOULD_USE_DATA_ZP
if (input_on_padding)
in[reg] = data_zp_val;
@@ -255,8 +252,8 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
}
#ifdef BLOCK_LOAD_WEIGHTS
*((int8*)&w[0]) = as_int8(intel_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
w[8] = as_int(intel_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
*((int8*)&w[0]) = as_int8(_sub_group_block_read8((const __global uint*) &weights[weight_addr]));
w[8] = as_int(_sub_group_block_read((const __global uint*) &weights[weight_addr + (SIMD_SIZE<<3)]));
weight_addr += SIMD_SIZE*NUM_FILTERS;
#else
for(int pf = 0; pf < NUM_FILTERS; pf++) {
@@ -278,10 +275,8 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
dotProdAZPxW = TO_ACCUMULATOR_TYPE(IMAD(dotProdAZPxW, AS_INPUT0_TYPE_4(data_zp_val), AS_FILTER_TYPE_4(w[wi])));
#endif
__attribute__((opencl_unroll_hint))
for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
__attribute__((opencl_unroll_hint))
for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
unroll_for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
unroll_for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
INPUT0_TYPE_4 inputs = AS_INPUT0_TYPE_4(sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y],
bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X));
@@ -403,5 +398,3 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
#undef FILTER_TYPE_4
#undef AS_FILTER_TYPE_4
#undef NUM_FILTERS
#undef CEIL_DIV
#undef ALIGN

View File

@@ -4,7 +4,8 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/imad.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
@@ -24,7 +25,7 @@
#define BATCH_SLICE_SIZE 16
#define FEATURE_SLICE_SIZE 16
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
const __global INPUT0_TYPE *conv_input,
__global OUTPUT_TYPE *output,
@@ -63,15 +64,15 @@ KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
__attribute__((opencl_unroll_hint(16)))
for (uint j = 0; j < 16; j++) {
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val.s3, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val2.s0, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val2.s1, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val2.s2, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val2.s3, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val2.s0, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val2.s1, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val2.s2, j))));
dotProd[16 + j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[16 + j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val2.s3, j))));
}
filter_idx += weights_x_pitch;
filter_idx2 += weights_x_pitch;
@@ -94,7 +95,7 @@ KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_1x1)(
ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + intel_sub_group_shuffle(bias, i);
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i] + _sub_group_shuffle(bias, i);
#else
dequantized = (ACTIVATION_TYPE)dotProd[16 * j + i];
#endif

View File

@@ -4,7 +4,8 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#if QUANTIZATION_TERM
#define ACCUMULATOR_TYPE int
#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
@@ -25,7 +26,7 @@
// int8 conv_input and weights data is packed to int32 "batches",
// int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_imad_bs_fs_yx_bsv16_fsv16_3x3)(
const __global INPUT0_TYPE *conv_input,
__global OUTPUT_TYPE *output,
@@ -68,10 +69,10 @@ uint split_idx)
__attribute__((opencl_unroll_hint(16)))
for (uint j = 0; j < 16; j++) {
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(intel_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(intel_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(intel_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(intel_sub_group_shuffle(weights_val.s3, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s0), as_char4(_sub_group_shuffle(weights_val.s0, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s1), as_char4(_sub_group_shuffle(weights_val.s1, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s2), as_char4(_sub_group_shuffle(weights_val.s2, j))));
dotProd[j] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[j], AS_INPUT0_TYPE_4(input_val0.s3), as_char4(_sub_group_shuffle(weights_val.s3, j))));
}
filter_idx += weights_x_pitch;
input_idx += input_x_pitch;
@@ -93,7 +94,7 @@ uint split_idx)
for (uint i = 0; i < 16; i++) {
ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
#if BIAS_TERM
dequantized = (ACTIVATION_TYPE)dotProd[i] + intel_sub_group_shuffle(bias, i);
dequantized = (ACTIVATION_TYPE)dotProd[i] + _sub_group_shuffle(bias, i);
#else
dequantized = (ACTIVATION_TYPE)dotProd[i];
#endif

View File

@@ -2,7 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/mmad.cl"
@@ -26,7 +27,7 @@
#define ACTIVATION_TYPE_VEC float8
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
#define MMAD MMAD_8x8
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
#elif OUTPUT_X_BLOCK_SIZE == 4
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
#define ACCUMULATOR_TYPE_VEC int4
@@ -34,13 +35,13 @@
#define ACTIVATION_TYPE_VEC float4
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
#define MMAD MMAD_4x8
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
#else
#error "convolution_gpu_mmad_b_fs_yx_fsv32: Unsupported block size"
#endif
__attribute__((reqd_work_group_size(8, OW_GROUP, 1)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(convolution_mmad_b_fs_yx_fsv32)(
__global INPUT0_TYPE* input,
__global PACKED_OUT_TYPE* output,
@@ -145,7 +146,7 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
}
else
{
line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, intel_sub_group_block_read((const __global uint*)(input + in_addr +
line_cache[xb] = AS_TYPE(PACKED_IN_TYPE, _sub_group_block_read((const __global uint*)(input + in_addr +
icb * input_fs_pitch +
kd * DILATION_SIZE_Z * input_z_pitch +
kh * DILATION_SIZE_Y * input_y_pitch +
@@ -166,10 +167,10 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32)(
+ kh * ISV_SIZE * OSV_SIZE * FILTER_SIZE_X
+ kw * ISV_SIZE * OSV_SIZE;
int8 weights_data0 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE)));
int8 weights_data1 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE)));
int8 weights_data2 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE)));
int8 weights_data3 = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE)));
int8 weights_data0 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 0*8*ISV_SIZE)));
int8 weights_data1 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 1*8*ISV_SIZE)));
int8 weights_data2 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 2*8*ISV_SIZE)));
int8 weights_data3 = as_int8(_sub_group_block_read8((const __global uint*)(weights + f_off + 3*8*ISV_SIZE)));
acc[0] = MMAD(src, weights_data0, acc[0]); // 8 elements in 4*lid+0 out channel
acc[1] = MMAD(src, weights_data1, acc[1]); // 8 elements in 4*lid+1 out channel

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(convolution_mmad_b_fs_yx_fsv32_dw)(

View File

@@ -2,11 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/imad.cl"
#define CEIL_DIV(x, y) (1 + ((x) - 1) / (y))
#include "include/batch_headers/imad.cl"
#define ISV 4
@@ -30,9 +29,9 @@
#define ACTIVATION_TYPE_VEC float8
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
#if OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#else // OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) BLOCK_WRITE_UC_8((__global uchar*)(ptr), as_uchar8(val))
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_uc8((__global uchar*)(ptr), as_uchar8(val))
#endif // OUTPUT_LAYOUT_B_FS_YX_FSV32
#elif OUTPUT_X_BLOCK_SIZE == 4
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
@@ -41,9 +40,9 @@
#define ACTIVATION_TYPE_VEC float4
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
#if OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#else // OUTPUT_LAYOUT_B_FS_YX_FSV32
#define BLOCK_WRITE(ptr, val) BLOCK_WRITE_UC_4((__global uchar*)(ptr), as_uchar4(val))
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_uc4((__global uchar*)(ptr), as_uchar4(val))
#endif // OUTPUT_LAYOUT_B_FS_YX_FSV32
#else
#error "convolution_gpu_mmad_bfyx_b_fs_yx_fsv32: Unsupported block size"
@@ -52,9 +51,8 @@
#define AS_TYPE_N_(type, n, x) as_##type##n(x)
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(LWS0, LWS1, LWS2)))
KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
__global INPUT0_TYPE* input,
@@ -265,9 +263,9 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
+ kh * OSV * ISV * FILTER_SIZE_X
+ kw * OSV * ISV;
int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
int weights_data0 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off)));
#if OUTPUT_FEATURE_NUM > 16
int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
int weights_data1 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off + SUB_GROUP_SIZE*ISV)));
#endif
PACKED_TYPE_VEC src;
@@ -492,7 +490,6 @@ KERNEL(convolution_mmad_bfyx_to_b_fs_yx_fsv32)(
#endif // OUTPUT_IS_FP
}
#undef CEIL_DIV
#undef PACKED_TYPE_VEC
#undef ACCUMULATOR_TYPE_VEC
#undef TO_ACCUMULATOR_TYPE_VEC

View File

@@ -2,11 +2,10 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/imad.cl"
#define CEIL_DIV(x, y) (1 + ((x) - 1) / (y))
#include "include/batch_headers/imad.cl"
#ifdef ACCUMULATOR_TYPE
#undef ACCUMULATOR_TYPE
@@ -27,14 +26,14 @@
#define TO_ACCUMULATOR_TYPE_VEC(x) convert_int8(x)
#define ACTIVATION_TYPE_VEC float8
#define TO_ACTIVATION_TYPE_VEC(x) convert_float8(x)
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
#elif OUTPUT_X_BLOCK_SIZE == 4
#define PACKED_TYPE_VEC MAKE_VECTOR_TYPE(PACKED_IN_TYPE, 4)
#define ACCUMULATOR_TYPE_VEC int4
#define TO_ACCUMULATOR_TYPE_VEC(x) convert_int4(x)
#define ACTIVATION_TYPE_VEC float4
#define TO_ACTIVATION_TYPE_VEC(x) convert_float4(x)
#define BLOCK_WRITE(ptr, val) intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#define BLOCK_WRITE(ptr, val) _sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
#else
#error "convolution_gpu_mmad_bfyx_to_b_fs_yx_fsv4: Unsupported block size"
#endif
@@ -43,7 +42,7 @@
#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
__global INPUT0_TYPE* input,
__global PACKED_OUT_TYPE* output,
@@ -129,8 +128,8 @@ KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
+ kh * OSV * 4 * FILTER_SIZE_X
+ kw * OSV * 4;
int weights_data0 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off)));
int weights_data1 = as_int(intel_sub_group_block_read((const __global uint*)(weights + f_off + 16*4)));
int weights_data0 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off)));
int weights_data1 = as_int(_sub_group_block_read((const __global uint*)(weights + f_off + 16*4)));
PACKED_TYPE_VEC src;
@@ -223,7 +222,6 @@ KERNEL(convolution_mmad_bfyx_b_fs_yx_fsv32)(
#endif // OUTPUT_IS_FP
}
#undef CEIL_DIV
#undef PACKED_TYPE_VEC
#undef ACCUMULATOR_TYPE_VEC
#undef TO_ACCUMULATOR_TYPE_VEC

View File

@@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/sub_group_shuffle.cl"
// --------------------------------------------------------------------------------------------------------------------------------
// L3_SIMD_4x8
// Input matrices dimensions: M x K x N
@@ -35,7 +37,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
const int INPUT0_SIZE_Y_PITCH_UNIT_4 = INPUT0_PITCH_SIZE_Y / VEC_SIZE; //for bxyf -> INPUT0_PITCH_SIZE_Y is equal to input features count, since ifm % 32 == 0, division by VEC_SIZE is ok
const int OUTPUT_SIZE_Y_PITCH_UNIT_4 = OUTPUT_Y_PITCH / VEC_SIZE; //for bxyf -> OUTPUT_Y_PITCH is equal to output features count, since ofm % 32 == 0, division by VEC_SIZE is ok
const int WEIGHTS_FEATURE_PITCH_UNIT_4 = WEIGHTS_PITCH_FEATURE / VEC_SIZE; //for xyio -> WEIGHTS_PITCH_FEATURE is equal to the output features count
const int group_x = get_group_id(0);
const int group_y = get_group_id(1);
const int group_z = get_group_id(2);
@@ -59,10 +61,10 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
const int y_idx = tile_idx_y; //winograd tile height == 1
const int f_idx = group_x * TILE_N + local_x * VEC_SIZE;
const int b_idx = batch_idx;
const int in_tile_idx = (x_idx % WINOGRAD_TILE_WIDTH);
const int tile_idx_x = (x_idx / WINOGRAD_TILE_WIDTH);
// Result ctile is M rows x N columns
// M = 8, we have 1 rows of work-items, so we need 8/1 = 8 results down
// N = 32, we have 8 columns of work-items, so we need 32/8 = 4 results across = 1 float4s across
@@ -124,11 +126,11 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
const UNIT_TYPE_4 a6 = src0[6 * INPUT0_SIZE_Y_PITCH_UNIT_4];
const UNIT_TYPE_4 a7 = src0[7 * INPUT0_SIZE_Y_PITCH_UNIT_4];
#define DOT_PRODUCT( _i, _j ) { a = intel_sub_group_shuffle(a ## _i, _j); c ## _i = mad(a.x, b0, mad(a.y, b1, mad(a.z, b2, mad(a.w, b3, c ## _i)))); }
#define DOT_PRODUCT( _i, _j ) { a = _sub_group_shuffle(a ## _i, _j); c ## _i = mad(a.x, b0, mad(a.y, b1, mad(a.z, b2, mad(a.w, b3, c ## _i)))); }
//in one iteration load weights tile 1-width, 1-height, 4-depth from 4 different filters (ofms)
//SIMD reads are chained along b-axis (different ofms), resulting in 1-width, 1-height, 4-depth blocks from 4*8=32 different filters
//consecutive reads are chained along f-dim and overflows to y-dim, reading in total
//consecutive reads are chained along f-dim and overflows to y-dim, reading in total
#define ITERATION( _j ) \
{ \
const UNIT_TYPE_4 b0 = src1[0]; src1 += WEIGHTS_FEATURE_PITCH_UNIT_4; \
@@ -165,7 +167,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1)
src0 += TILE_K / VEC_SIZE;
}
dst[0] = c0; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;
dst[0] = c1; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;
dst[0] = c2; dst += OUTPUT_SIZE_Y_PITCH_UNIT_4;

View File

@@ -9,26 +9,27 @@
// --------------------------------------------------------------------------------------------------------------------------------
#include "include/batch_headers/common.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#define DOT4i0( _result, _A, _B, i) \
{ \
_result = mad(_A.s0, intel_sub_group_shuffle( _B.s0, (i)), _result); \
_result = mad(_A.s0, _sub_group_shuffle( _B.s0, (i)), _result); \
}
#define DOT4i1( _result, _A, _B, i) \
{ \
_result = mad(_A.s1, intel_sub_group_shuffle( _B.s1, (i)), _result); \
_result = mad(_A.s1, _sub_group_shuffle( _B.s1, (i)), _result); \
}
#define DOT4i2( _result, _A, _B, i) \
{ \
_result = mad(_A.s2, intel_sub_group_shuffle( _B.s2, (i)), _result); \
_result = mad(_A.s2, _sub_group_shuffle( _B.s2, (i)), _result); \
}
#define DOT4i3( _result, _A, _B, i) \
{ \
_result = mad(_A.s3, intel_sub_group_shuffle( _B.s3, (i)), _result); \
_result = mad(_A.s3, _sub_group_shuffle( _B.s3, (i)), _result); \
}
#define UNIT_TYPE_2 CAT(UNIT_TYPE, 2)
@@ -36,15 +37,15 @@
#define UNIT_TYPE_8 CAT(UNIT_TYPE, 8)
__attribute__((reqd_work_group_size(8, 2, 8)))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_winograd_2x3_s1_fused)
(
__global INPUT0_TYPE* I,
__global OUTPUT_TYPE* O,
__global FILTER_TYPE* U,
__global INPUT0_TYPE* I,
__global OUTPUT_TYPE* O,
__global FILTER_TYPE* U,
#if BIAS_TERM
const __global UNIT_TYPE * bias,
#endif
#endif
uint split_idx)
{
// (DxC2)x(UxWx8c)
@@ -52,17 +53,17 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
__local UNIT_TYPE_4 V[slmSize*2]; // 8 KB
/* These constants are defined as precompiler macros during compilation. */
const uint WC = W*INPUT0_FEATURE_NUM;
const uint HW = H*W;
const uint HWC = H*WC;
const uint WC4 = WC >> 2;
const uint K16 = FILTER_OFM_NUM >> 4;
const uint C4 = INPUT0_FEATURE_NUM >> 2;
const uint K2 = FILTER_OFM_NUM >> 1;
const uint QK2 = Q*K2;
const uint QK = Q*FILTER_OFM_NUM;
const uint PQK = P*QK;
const uint WC = W*INPUT0_FEATURE_NUM;
const uint HW = H*W;
const uint HWC = H*WC;
const uint WC4 = WC >> 2;
const uint K16 = FILTER_OFM_NUM >> 4;
const uint C4 = INPUT0_FEATURE_NUM >> 2;
const uint K2 = FILTER_OFM_NUM >> 1;
const uint QK2 = Q*K2;
const uint QK = Q*FILTER_OFM_NUM;
const uint PQK = P*QK;
const uint upperHalf = get_local_id(1);
uint gx = get_group_id(0);
uint gy = (uint)get_group_id(1)*2+((uint)get_group_id(2)%2);
@@ -86,7 +87,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
int x = gx*14 + lz*2 + lxd4 - px;
int y = gy*4 - py;
uint k = gk*16 + lzd4*8;
// # x->
// # M0 M1 M2 M3 M4 M5 M6
// # +------------------------------------------
@@ -113,13 +114,13 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
uint lxm2 = lx % 2;
uint lxb1 = (lx & 2)/2;
uint2 coordU0;
coordU0.x = (lzm4*24 + k*12);
coordU0.y = 0;
uint slmPipeStage = 0;
__attribute__((opencl_unroll_hint(1)))
for (uint c = lxm4; c < C4_up16; c += 4) {
@@ -142,7 +143,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
bool y5_in = 0 <= (y + 5) && (y + 5) < H && x_in;
#if INPUT0_LAYOUT_BYXF
/* const UNIT_TYPE_4 I_load_0 = y0_in ? I_load[0*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
const UNIT_TYPE_4 I_load_1 = y1_in ? I_load[1*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
const UNIT_TYPE_4 I_load_2 = y2_in ? I_load[2*WC4+c] : (UNIT_TYPE_4)(UNIT_VAL_ZERO);
@@ -227,10 +228,10 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
//uint coordU_x = coordU0.x + get_sub_group_local_id()%8;
const uint flatA = coordU0.y*FILTER_OFM_NUM*KCOLSW*KROWSW + coordU0.x + get_sub_group_local_id()%8;
const UNIT_TYPE_4 f0 = (UNIT_TYPE_4)(
*(__global UNIT_TYPE *)(&U[flatA+0*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+1*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+2*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+3*FILTER_OFM_NUM*KCOLSW*KROWSW])); // as_UNIT_TYPE_4(intel_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+0*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+1*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+2*FILTER_OFM_NUM*KCOLSW*KROWSW]), // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
*(__global UNIT_TYPE *)(&U[flatA+3*FILTER_OFM_NUM*KCOLSW*KROWSW])); // as_UNIT_TYPE_4(_sub_group_block_read4(U, coordU));
// row 0
@@ -554,7 +555,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
DOT4i3(M6.s2, f1, V13, 2 + c4);
DOT4i3(M6.s3, f1, V13, 4 + c4);
//flatA += 8;
const UNIT_TYPE_4 f2 = (UNIT_TYPE_4)(
*(__global UNIT_TYPE *)(&U[flatA + 16 + 0 * FILTER_OFM_NUM*KCOLSW*KROWSW]),
@@ -563,7 +564,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
*(__global UNIT_TYPE *)(&U[flatA + 16 + 3 * FILTER_OFM_NUM*KCOLSW*KROWSW]));
coordU0.y += 4;
// f2[c4] x v[2 .. 16]
DOT4i0(M0.s0, f2, V00, 4 + c4);
DOT4i0(M0.s1, f2, V00, 6 + c4);
@@ -628,7 +629,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
// row 1
// f2 x v[2 .. 16]
DOT4i1(M0.s2, f2, V10, 4 + c4);
DOT4i1(M0.s3, f2, V10, 6 + c4);
@@ -649,7 +650,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
DOT4i1(M6.s3, f2, V13, 6 + c4);
// f2[c4] x v[2 .. 16]
DOT4i2(M0.s0, f2, V00, 4 + c4);
DOT4i2(M0.s1, f2, V00, 6 + c4);
@@ -759,11 +760,11 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
barrier(CLK_LOCAL_MEM_FENCE);
}
if (lz < 7)
if (lz < 7)
{
// Load multiplies from SLM.
__local const UNIT_TYPE_8 *M_read = (__local UNIT_TYPE_8*)&V[lz*8 + lxd4*224 + lxm4*2 + slmSize*upperHalf];
UNIT_TYPE_8 M0 = M_read[0*28];
UNIT_TYPE_8 M1 = M_read[1*28];
UNIT_TYPE_8 M2 = M_read[2*28];
@@ -821,7 +822,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write_0[0] = ACTIVATION(S0.s0, ACTIVATION_PARAMS);
O_write_0[0+Q*P] = ACTIVATION(S0.s4, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -829,7 +830,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[0*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s1 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s5 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[0*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s1, ACTIVATION_PARAMS), ACTIVATION(S0.s5, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_0[1] = ACTIVATION(S0.s1 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -837,8 +838,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_0[1] = ACTIVATION(S0.s1, ACTIVATION_PARAMS);
O_write_0[1+Q*P] = ACTIVATION(S0.s5, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -850,7 +851,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[1*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s0 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s4 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[1*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s0, ACTIVATION_PARAMS), ACTIVATION(S1.s4, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_1[0] = ACTIVATION(S1.s0 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -858,8 +859,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_1[0] = ACTIVATION(S1.s0, ACTIVATION_PARAMS);
O_write_1[0+Q*P] = ACTIVATION(S1.s4, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -867,7 +868,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[1*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s1 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s5 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[1*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s1, ACTIVATION_PARAMS), ACTIVATION(S1.s5, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_1[1] = ACTIVATION(S1.s1 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -875,8 +876,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_1[1] = ACTIVATION(S1.s1, ACTIVATION_PARAMS);
O_write_1[1+Q*P] = ACTIVATION(S1.s5, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -888,7 +889,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[2*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s2 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s6 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[2*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s2, ACTIVATION_PARAMS), ACTIVATION(S0.s6, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_2[0] = ACTIVATION(S0.s2 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -896,8 +897,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_2[0] = ACTIVATION(S0.s2, ACTIVATION_PARAMS);
O_write_2[0+Q*P] = ACTIVATION(S0.s6, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -905,7 +906,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[2*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s3 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S0.s7 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[2*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S0.s3, ACTIVATION_PARAMS), ACTIVATION(S0.s7, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_2[1] = ACTIVATION(S0.s3 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -913,8 +914,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_2[1] = ACTIVATION(S0.s3, ACTIVATION_PARAMS);
O_write_2[1+Q*P] = ACTIVATION(S0.s7, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -926,7 +927,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[3*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s2 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s6 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[3*QK2 + 0*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s2, ACTIVATION_PARAMS), ACTIVATION(S1.s6, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_3[0] = ACTIVATION(S1.s2 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -934,7 +935,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_3[0] = ACTIVATION(S1.s2, ACTIVATION_PARAMS);
O_write_3[0+Q*P] = ACTIVATION(S1.s6, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -943,7 +944,7 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
O_write[3*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s3 + bias[bias_index0], ACTIVATION_PARAMS), ACTIVATION(S1.s7 + bias[bias_index1], ACTIVATION_PARAMS));
#else
O_write[3*QK2 + 1*K2] = (UNIT_TYPE_2)(ACTIVATION(S1.s3, ACTIVATION_PARAMS), ACTIVATION(S1.s7, ACTIVATION_PARAMS));
#endif
#endif
#else
#if BIAS_TERM
O_write_3[1] = ACTIVATION(S1.s3 + bias[bias_index0], ACTIVATION_PARAMS);
@@ -951,8 +952,8 @@ KERNEL(convolution_gpu_winograd_2x3_s1_fused)
#else
O_write_3[1] = ACTIVATION(S1.s3, ACTIVATION_PARAMS);
O_write_3[1+Q*P] = ACTIVATION(S1.s7, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
}

View File

@@ -9,7 +9,7 @@
// --------------------------------------------------------------------------------------------------------------------------------
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#define DOT8i_0( _result, _A, _B, i) \
@@ -63,7 +63,7 @@
__attribute__((reqd_work_group_size(16, 1, 8)))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(convolution_gpu_winograd_6x3_s1_fused)
(
__global INPUT0_TYPE* I,
@@ -75,7 +75,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
#endif
#if BIAS_TERM
const __global UNIT_TYPE * bias,
#endif
#endif
uint split_idx)
{
// (DxC2)x(UxWx8c)
@@ -100,7 +100,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
uint gx = get_group_id(0);
uint gy = get_group_id(1);
uint gz = get_group_id(2);
uint gz = get_group_id(2);
uint gk = gz % K16;
uint gn = gz / K16;
@@ -266,7 +266,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
__local const UNIT_TYPE_8 *V_read_c16 = V_read;
__attribute__((opencl_unroll_hint(1)))
for (uint c16 = 0; c16 < 2
for (uint c16 = 0; c16 < 2
#ifndef FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB
&& coordU0.y < last_coord_y
#endif
@@ -297,17 +297,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
// Fetch 8 channels of Winograd components from f(k,s)
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
const UNIT_TYPE_8 f00 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x, coordU0.y)));
const UNIT_TYPE_8 f00 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x, coordU0.y)));
#else
const UNIT_TYPE_8 f00 = (UNIT_TYPE_8)(
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 0 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 1 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 2 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 3 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 4 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 5 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 6 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 7 * WEIGHTWIDTH])));
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 0 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 1 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 2 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 3 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 4 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 5 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 6 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 7 * WEIGHTWIDTH])));
#endif
@@ -467,17 +467,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
DOT8i_7(M6.s1, f00, V8, 10 + c8);
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
const UNIT_TYPE_8 f01 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x + 16 * sizeof(UNIT_TYPE), coordU0.y)));
const UNIT_TYPE_8 f01 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x + 16 * sizeof(UNIT_TYPE), coordU0.y)));
#else
const UNIT_TYPE_8 f01 = (UNIT_TYPE_8)(
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 0 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 1 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 2 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 3 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 4 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 5 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 6 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 7 * WEIGHTWIDTH])));
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 0 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 1 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 2 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 3 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 4 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 5 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 6 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 16 + 7 * WEIGHTWIDTH])));
#endif
// f1[c8] x v[1 .. 15]
@@ -637,17 +637,17 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
DOT8i_7(M6.s1, f01, V8, 12 + c8);
#if FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_FBXYB || FILTER_LAYOUT_IMAGE_2D_WEIGHTS_WINOGRAD_6x3_S1_XFBYB
const UNIT_TYPE_8 f02 = as_half8(intel_sub_group_block_read_us8(U, (int2)(coordU0.x + 32 * sizeof(UNIT_TYPE), coordU0.y)));
const UNIT_TYPE_8 f02 = as_half8(_sub_group_block_read_us8(U, (int2)(coordU0.x + 32 * sizeof(UNIT_TYPE), coordU0.y)));
#else
const UNIT_TYPE_8 f02 = (UNIT_TYPE_8)(
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 0 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 1 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 2 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 3 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 4 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 5 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 6 * WEIGHTWIDTH])),
as_half(intel_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 7 * WEIGHTWIDTH])));
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 0 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 1 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 2 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 3 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 4 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 5 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 6 * WEIGHTWIDTH])),
as_half(_sub_group_block_read_us((__global unsigned short *)&U[flatA + 32 + 7 * WEIGHTWIDTH])));
#endif
coordU0.y += 8;
@@ -919,7 +919,7 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
#else
O_write_0[0] = ACTIVATION(S0.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -927,14 +927,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[0 * QK + 1 * K] = ACTIVATION(S0.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[0 * QK + 1 * K] = ACTIVATION(S0.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_0[1] = ACTIVATION(S0.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_0[1] = ACTIVATION(S0.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -946,14 +946,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[1 * QK + 0 * K] = ACTIVATION(S1.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[1 * QK + 0 * K] = ACTIVATION(S1.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_1[0] = ACTIVATION(S1.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_1[0] = ACTIVATION(S1.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -961,14 +961,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[1 * QK + 1 * K] = ACTIVATION(S1.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[1 * QK + 1 * K] = ACTIVATION(S1.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_1[1] = ACTIVATION(S1.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_1[1] = ACTIVATION(S1.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -980,14 +980,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[2 * QK + 0 * K] = ACTIVATION(S2.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[2 * QK + 0 * K] = ACTIVATION(S2.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_2[0] = ACTIVATION(S2.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_2[0] = ACTIVATION(S2.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
if (q1_in) {
#if OUTPUT_LAYOUT_BYXF
@@ -995,14 +995,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[2 * QK + 1 * K] = ACTIVATION(S2.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[2 * QK + 1 * K] = ACTIVATION(S2.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_2[1] = ACTIVATION(S2.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_2[1] = ACTIVATION(S2.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -1014,13 +1014,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[3 * QK + 0 * K] = ACTIVATION(S3.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[3 * QK + 0 * K] = ACTIVATION(S3.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_3[0] = ACTIVATION(S3.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_3[0] = ACTIVATION(S3.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -1029,14 +1029,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[3 * QK + 1 * K] = ACTIVATION(S3.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[3 * QK + 1 * K] = ACTIVATION(S3.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_3[1] = ACTIVATION(S3.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_3[1] = ACTIVATION(S3.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
}
@@ -1049,13 +1049,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[4 * QK + 0 * K] = ACTIVATION(S4.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[4 * QK + 0 * K] = ACTIVATION(S4.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_4[0] = ACTIVATION(S4.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_4[0] = ACTIVATION(S4.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -1064,14 +1064,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[4 * QK + 1 * K] = ACTIVATION(S4.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[4 * QK + 1 * K] = ACTIVATION(S4.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_4[1] = ACTIVATION(S4.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_4[1] = ACTIVATION(S4.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
@@ -1083,13 +1083,13 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[5 * QK + 0 * K] = ACTIVATION(S5.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[5 * QK + 0 * K] = ACTIVATION(S5.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_5[0] = ACTIVATION(S5.s0 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_5[0] = ACTIVATION(S5.s0 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
}
if (q1_in) {
@@ -1098,14 +1098,14 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
O_write[5 * QK + 1 * K] = ACTIVATION(S5.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write[5 * QK + 1 * K] = ACTIVATION(S5.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#else
#if BIAS_TERM
O_write_5[1] = ACTIVATION(S5.s1 * scl + bias[bias_index0], ACTIVATION_PARAMS);
#else
O_write_5[1] = ACTIVATION(S5.s1 * scl, ACTIVATION_PARAMS);
#endif
#endif
#endif
#endif
}
}
}
@@ -1113,4 +1113,4 @@ KERNEL(convolution_gpu_winograd_6x3_s1_fused)
}
#undef UNIT_TYPE_2
#undef UNIT_TYPE_4
#undef UNIT_TYPE_8
#undef UNIT_TYPE_8

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(convolution_gpu_yxfb_ref)(

View File

@@ -2,11 +2,13 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
__attribute__((reqd_work_group_size(16, 1, 1)))
KERNEL(convolution_gpu_yxfb_yxio_b16)(
const __global UNIT_TYPE* input,
@@ -94,7 +96,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
for (uint h = 0; h < FILTER_IFM_NUM; h++)
{
#if defined(USE_BLOCK_READ_2)
half4 _input = as_half4(intel_sub_group_block_read2((const __global uint*)(input + input_idx)));
half4 _input = as_half4(_sub_group_block_read2((const __global uint*)(input + input_idx)));
uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
_data[0] = fma(_input.s0, filter_transp, _data[0]);
@@ -103,7 +105,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
_data[3] = fma(_input.s3, filter_transp, _data[3]);
input_idx += INPUT0_FEATURE_PITCH;
#elif defined(USE_BLOCK_READ_1)
half2 _input = as_half2(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
half2 _input = as_half2(_sub_group_block_read((const __global uint*)(input + input_idx)));
uint filter_val_pair = *(const __global uint*)(filter + filter_idx);
half16 filter_transp = TRANSPOSE_BLOCK_16_FP16(filter_val_pair);
_data[0] = fma(_input.s0, filter_transp, _data[0]);

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -93,7 +95,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b16)(
for (uint h = 0; h < FILTER_IFM_NUM; h++)
{
#ifdef USE_BLOCK_READ_2
float2 _input = as_float2(intel_sub_group_block_read2((const __global uint*)input + input_idx));
float2 _input = as_float2(_sub_group_block_read2((const __global uint*)input + input_idx));
float8 filter_transp = TRANSPOSE_BLOCK_8(filter[filter_idx]);
_data[0] = fma(_input.s0, filter_transp, _data[0]);
_data[1] = fma(_input.s1, filter_transp, _data[1]);

View File

@@ -2,7 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -18,23 +19,23 @@ KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
{
#if USE_VECTOR == 8
#define VECTOR_FLOAT float8
#define BLOCK_READ(IN) as_float8(intel_sub_group_block_read8((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
#define BLOCK_READ(IN) as_float8(_sub_group_block_read8((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write8((__global uint*)OUT, as_uint8(DATA));
#endif
#if USE_VECTOR == 4
#define VECTOR_FLOAT float4
#define BLOCK_READ(IN) as_float4(intel_sub_group_block_read4((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
#define BLOCK_READ(IN) as_float4(_sub_group_block_read4((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write4((__global uint*)OUT, as_uint4(DATA));
#endif
#if USE_VECTOR == 2
#define VECTOR_FLOAT float2
#define BLOCK_READ(IN) as_float2(intel_sub_group_block_read2((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
#define BLOCK_READ(IN) as_float2(_sub_group_block_read2((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write2((__global uint*)OUT, as_uint2(DATA));
#endif
#if USE_VECTOR == 1
#define VECTOR_FLOAT float
#define BLOCK_READ(IN) as_float(intel_sub_group_block_read((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) intel_sub_group_block_write((__global uint*)OUT, as_uint(DATA));
#define BLOCK_READ(IN) as_float(_sub_group_block_read((const __global uint*)IN))
#define BLOCK_WRITE(OUT, DATA) _sub_group_block_write((__global uint*)OUT, as_uint(DATA));
#endif
const uint batch_num = INPUT0_BATCH_NUM;
@@ -99,7 +100,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b1_block_multiple_x)(
float _in[X_PER_WORK_ITEM];
for(uint a = 0; a < X_PER_WORK_ITEM; a++)
{
_in[a] = as_float(intel_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
_in[a] = as_float(_sub_group_block_read((const __global uint*)input + (input_idx + a * INPUT0_FEATURE_NUM * STRIDE_SIZE_X)));
}
float8 _input[X_PER_WORK_ITEM];
for(uint a = 0; a < X_PER_WORK_ITEM; a++)

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
@@ -65,7 +67,7 @@ KERNEL(convolution_gpu_yxfb_yxio_b8)(
#endif
for (uint h = 0; h < FILTER_IFM_NUM / 8; h++)
{
float8 _input = as_float8(intel_sub_group_block_read8((const __global uint*)input + input_idx));
float8 _input = as_float8(_sub_group_block_read8((const __global uint*)input + input_idx));
DOT_PRODUCT_8(_data0, _input.s0, filter[filter_idx]) filter_idx += FILTER_OFM_NUM;
#if OFM_PER_WORK_ITEM == 16
@@ -128,8 +130,8 @@ KERNEL(convolution_gpu_yxfb_yxio_b8)(
#endif
const uint _out_id = OUTPUT_OFFSET + out_id;
intel_sub_group_block_write8((__global uint*)output + _out_id, as_uint8(_data0));
_sub_group_block_write8((__global uint*)output + _out_id, as_uint8(_data0));
#if OFM_PER_WORK_ITEM == 16
intel_sub_group_block_write8((__global uint*)output + _out_id + 8 * INPUT0_FEATURE_PITCH, as_uint8(_data1));
_sub_group_block_write8((__global uint*)output + _out_id + 8 * INPUT0_FEATURE_PITCH, as_uint8(_data1));
#endif
}

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#define INPUT0_GET_INDEX1(idx_order) INPUT0_GET_INDEX(idx_order)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(ctc_greedy_decoder_ref)(const __global INPUT0_TYPE* probabilities

View File

@@ -2,36 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS == 6
return INPUT0_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: input format - not supported
#endif
}
///////////////////////// Output Index /////////////////////////
inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS == 6
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
inline void FUNC(get_indices)(int *axes)
{
@@ -87,8 +58,6 @@ inline void FUNC(get_indices)(int *axes)
#endif
}
#define unroll_for __attribute__((opencl_unroll_hint)) for
#if CUM_SUM_PARTIAL_SUM
inline uint FUNC(get_current_index)(int axis, int i)
{
@@ -99,7 +68,7 @@ inline uint FUNC(get_current_index)(int axis, int i)
#endif
}
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(LWS, 1, 1)))
KERNEL(cum_sum_partial_sum)(
const __global INPUT0_TYPE* input,
@@ -160,7 +129,7 @@ inline uint FUNC(get_current_index)(int i)
}
// main
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(LWS, 1, 1)))
KERNEL(cum_sum_final)(
const __global PARTIAL_TYPE* partial,

View File

@@ -2,36 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
///////////////////////// Input Index /////////////////////////
inline uint FUNC(get_input_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x)
{
#if INPUT0_DIMS < 5
return INPUT0_GET_INDEX(b, f, y, x);
#elif INPUT0_DIMS == 5
return INPUT0_GET_INDEX(b, f, z, y, x);
#elif INPUT0_DIMS == 6
return INPUT0_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: input format - not supported
#endif
}
///////////////////////// Output Index /////////////////////////
inline uint FUNC(get_output_index)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x)
{
#if OUTPUT_DIMS < 5
return OUTPUT_GET_INDEX(b, f, y, x);
#elif OUTPUT_DIMS == 5
return OUTPUT_GET_INDEX(b, f, z, y, x);
#elif OUTPUT_DIMS == 6
return OUTPUT_GET_INDEX(b, f, w, z, y, x);
#else
#error cum_sum_ref.cl: output format - not supported
#endif
}
#include "include/fetch_utils.cl"
KERNEL(cum_sum_ref)(
OPTIONAL_SHAPE_INFO_ARG

View File

@@ -2,13 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/data_types.cl"
#include "deconvolution_gpu_imad_common.cl"
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define FEATURE_SLICE_SIZE 16
#if X_BLOCK_SIZE == 1
@@ -54,7 +52,7 @@ DECLARE_READ_BLOCK_8(preload_weights, FILTER_TYPE)
# endif
#endif
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE))) // attr:no-format
REQD_SUB_GROUP_SIZE(FEATURE_SLICE_SIZE) // attr:no-format
__attribute__((reqd_work_group_size(1, FEATURE_SLICE_SIZE, 1)))
KERNEL(deconvolution_gpu_b_fs_zyx_fsv16_dw)(
const __global INPUT0_TYPE *input,
@@ -272,7 +270,6 @@ KERNEL(deconvolution_gpu_b_fs_zyx_fsv16_dw)(
}
}
#undef unroll_for
#undef FEATURE_SLICE_SIZE
#undef GET_VEC_ELEM

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define WORK_GROUP_GROUP_SIZE 16

View File

@@ -4,8 +4,10 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "deconvolution_gpu_imad_common.cl"
@@ -31,7 +33,7 @@ DECLARE_STORE_BLOCK_4(store_output, OUTPUT_TYPE)
#define WEIGHTS_IN_TILE_OFM_PITCH (TILE_IFM * SIMD)
__attribute__((reqd_work_group_size(1, SIMD, 1)))
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
KERNEL(deconvolution_gpu_imad_ref)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* restrict output,
@@ -127,8 +129,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
for (uint fi = 0; fi < FILTER_IFM_NUM; fi += TILE_IFM) {
// Load weights [TILE_OFM, TILE_IFM, 1, 1]
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
uint weights_idx = weights_offset + of * WEIGHTS_IN_TILE_OFM_PITCH / 4;
FUNC_CALL(load_weights_ui)(weights_ui, weights_idx, TILE_IFM / 4, wei[of]);
}
@@ -142,8 +143,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
uint input_offset = INPUT0_GET_INDEX(out_b, if_start + fi, fixed_in_z, fixed_in_y, fixed_in_x) / 4;
# endif
#endif
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
uint input_idx = input_offset + ob * INPUT_IN_TILE_B_PITCH / 4;
FUNC_CALL(load_input_ui)(input_ui, input_idx, TILE_IFM / 4, in[ob]);
}
@@ -151,24 +151,18 @@ KERNEL(deconvolution_gpu_imad_ref)(
input_offset += INPUT_TILE_IFM_PITCH / 4;
#endif
if (zero_x) {
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint ifp = 0; ifp < TILE_IFM / 4; ++ifp) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint ifp = 0; ifp < TILE_IFM / 4; ++ifp) {
in[ob][ifp] = 0;
}
}
}
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
__attribute__((opencl_unroll_hint))
for (uint imad_it = 0; imad_it < TILE_IFM / 4; ++imad_it) {
uint in_val = intel_sub_group_shuffle(in[ob][imad_it], tx);
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for(uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint imad_it = 0; imad_it < TILE_IFM / 4; ++imad_it) {
uint in_val = _sub_group_shuffle(in[ob][imad_it], tx);
acc[ob][of][tx] = IMAD(acc[ob][of][tx], AS_INPUT_TYPE4(in_val), AS_FILTER_TYPE4(wei[of][imad_it]));
}
}
@@ -180,25 +174,19 @@ KERNEL(deconvolution_gpu_imad_ref)(
}
ACTIVATION_TYPE dequantized[TILE_B][TILE_OFM][TILE_X];
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
dequantized[ob][of][tx] = TO_ACTIVATION_TYPE(acc[ob][of][tx]);
}
}
}
#if BIAS_TERM
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
BIAS_TYPE bias_val = bias[out_f + of * SIMD];
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
dequantized[ob][of][tx] += TO_ACTIVATION_TYPE(bias_val);
}
}
@@ -206,15 +194,12 @@ KERNEL(deconvolution_gpu_imad_ref)(
#endif
OUTPUT_TYPE result[TILE_B][TILE_OFM][TILE_X];
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint of = 0; of < TILE_OFM; ++of) {
#if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_PRELOAD;
#endif
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for(uint ob = 0; ob < TILE_B; ++ob) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
#if HAS_FUSED_OPS
# if FUSED_OPS_CAN_USE_PRELOAD
FUSED_OPS_CALC;
@@ -233,12 +218,9 @@ KERNEL(deconvolution_gpu_imad_ref)(
bool leftovers_f = OUTPUT_FEATURE_NUM % SIMD != 0 && out_f + SIMD >= OUTPUT_FEATURE_NUM;
#if OUTPUT_NAIVE_STORE
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
if ((leftovers_x && tx >= OUTPUT_SIZE_X % TILE_X) ||
(leftovers_f && out_f + of * SIMD >= OUTPUT_FEATURE_NUM))
break;
@@ -252,10 +234,8 @@ KERNEL(deconvolution_gpu_imad_ref)(
}
}
#elif OUTPUT_BLOCK_X_STORE
__attribute__((opencl_unroll_hint))
for (uint ob = 0; ob < TILE_B; ++ob) {
__attribute__((opencl_unroll_hint))
for (uint of = 0; of < TILE_OFM; ++of) {
unroll_for (uint ob = 0; ob < TILE_B; ++ob) {
unroll_for(uint of = 0; of < TILE_OFM; ++of) {
#if OUTPUT_DIMS <= 4
uint output_idx = OUTPUT_GET_INDEX(out_b + ob, out_fg + of * SIMD, out_y, out_x);
#elif OUTPUT_DIMS == 5
@@ -266,8 +246,7 @@ KERNEL(deconvolution_gpu_imad_ref)(
} else if (!leftovers_f) {
FUNC_CALL(store_output)(output, output_idx, OUTPUT_SIZE_X % TILE_X, result[ob][of]);
} else {
__attribute__((opencl_unroll_hint))
for (uint tx = 0; tx < TILE_X; ++tx) {
unroll_for (uint tx = 0; tx < TILE_X; ++tx) {
if (out_f + of * SIMD < OUTPUT_FEATURE_NUM && out_x + tx < OUTPUT_SIZE_X) {
output[output_idx + sglid + tx * SIMD] = result[ob][of][tx];
}

View File

@@ -2,10 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#define CEIL_DIV(a, b) (((a) + ((b) - 1)) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#define VEC_TO_ARR_1(var, arr, idx) \
arr[idx] = var

View File

@@ -4,8 +4,7 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/imad.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/imad.cl"
#include "deconvolution_gpu_imad_common.cl"

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(deconvolution_gpu_yxfb_ref)(

View File

@@ -2,15 +2,16 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
#define FEATURE_SLICE_SIZE 16
#define GET_WEI(filter, id) AS_TYPE(UNIT_TYPE, intel_sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, filter), id))
#define GET_WEI(filter, id) AS_TYPE(UNIT_TYPE, _sub_group_shuffle(AS_TYPE(UNIT_BLOCK_RW_TYPE, filter), id))
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(deformable_convolution_gpu_bfyx_conv)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,

View File

@@ -2,10 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL(deformable_convolution_gpu_bfyx_interp)(
const __global INPUT0_TYPE* data,
const __global INPUT1_TYPE* trans,
@@ -29,7 +28,7 @@ KERNEL(deformable_convolution_gpu_bfyx_interp)(
const int input_offset_x = input_x + kw * DILATION_SIZE_X;
const int input_offset_y = input_y + kh * DILATION_SIZE_Y;
#if DEFORMABLE_MASK_ENABLED
const int dg_size = dg * FILTER_SIZE_Y * FILTER_SIZE_X * OUTPUT_SIZE_Y * OUTPUT_SIZE_X;
const int trans_offset = b * INPUT1_BATCH_PITCH + 2 * dg_size;

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(depth_to_space_block2_opt)(const __global half* input, __global half* output)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(depth_to_space_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/common.cl"
#include "include/detection_output_common.cl"
@@ -56,7 +55,6 @@
// LOCAL_BATCHES_NUM - number of batch that can be process per work-group
// =================================================================================================================
#define unroll_for __attribute__((opencl_unroll_hint)) for
#define NUM_CLASSES_ACC (NUM_CLASSES + 2)
typedef struct __attribute__((__packed__)) {

View File

@@ -2,11 +2,11 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#define FEATURE_SLICE_SIZE 16
#define unroll_for __attribute__((opencl_unroll_hint())) for
#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, BLOCK_SIZE)
#define TO_TYPE(type, val) CAT(convert_, type)(val)
@@ -25,7 +25,7 @@
#define GET_INDEX(prefix, num, idx_order) CAT(CAT(prefix, num), _GET_INDEX)(idx_order)
#endif
__attribute__((intel_reqd_sub_group_size(FEATURE_SLICE_SIZE)))
REQD_SUB_GROUP_SIZE(FEATURE_SLICE_SIZE)
KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
__global OUTPUT_TYPE* output
#if HAS_FUSED_OPS_DECLS
@@ -107,7 +107,6 @@ KERNEL(eltwise_b_fs_yx_fsv16)(INPUTS_DECLS
}
#undef FEATURE_SLICE_SIZE
#undef unroll_for
#undef OUTPUT_TYPE_BLOCK
#undef TO_TYPE
#undef READ_FUNC

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#define OUTPUT_TYPE_BLOCK MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(eltwise_fs_b_yx_fsv32)(

View File

@@ -2,18 +2,19 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/unit_type.cl"
// Kernel works only for sub_group size of 16 with 32 features slice size and process 2 features per WI
#define REQD_SUB_GROUP_SIZE 16
#define SUB_GROUP_SIZE 16
#define REQD_FEATURE_SLICE_SIZE 32
#define REQD_FEATURES_PER_WORK_ITEM 2
//inputs_decls -> __global unit_type * input0, __global unit_type * input1
__attribute__((intel_reqd_sub_group_size(REQD_SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(eltwise_mixed_byxf_and_fs_b_yx_fsv32)(
INPUTS_DECLS
__global UNIT_TYPE* output)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(eltwise_gpu_vload8)(INPUTS_DECLS

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#ifdef PACKED_SUM

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#define INPUT_TYPE INPUT0_TYPE
#define INPUT_TYPE2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
inline int FUNC(get_pyramid_level_index)(uint level, uint c, uint y, uint x) {
uint idx = 0;
@@ -64,9 +63,10 @@ KERNEL(experimental_detectron_roi_feature_extractor_ref)(const __global INPUT0_T
const uint roi_bin_grid_w = (SAMPLING_RATIO > 0) ? SAMPLING_RATIO : (uint)ceil(roi_width / POOLED_WIDTH);
const uint roi_bin_grid_h = (SAMPLING_RATIO > 0) ? SAMPLING_RATIO : (uint)ceil(roi_height / POOLED_HEIGHT);
const uint level_h = LEVEL_SIZES[3 * level];
const uint level_w = LEVEL_SIZES[3 * level + 1];
const uint level_offset = LEVEL_SIZES[3 * level + 2];
size_t level_sizes_arr[3*NUM_PYRAMID_LEVELS] = LEVEL_SIZES;
const uint level_h = level_sizes_arr[3 * level];
const uint level_w = level_sizes_arr[3 * level + 1];
const uint level_offset = level_sizes_arr[3 * level + 2];
INPUT0_TYPE output_val = 0.0;
INPUT0_TYPE current_bin_start_h = roi_start_h + y * bin_height;

View File

@@ -3,7 +3,6 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
KERNEL(experimental_detectron_topk_rois_ref)(const __global INPUT0_TYPE* input_rois,
const __global INPUT1_TYPE* topk_indices, __global OUTPUT_TYPE* output_rois)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
KERNEL(extract_image_patches_ref)(const __global INPUT0_TYPE* input,

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"
#include "include/mmad.cl"
@@ -12,12 +11,12 @@
#define INPUT_PACKED_TYPE_VEC CAT(INPUT_PACKED_TYPE, SUB_GROUP_SIZE)
#define FILTER_PACKED_TYPE_VEC CAT(FILTER_PACKED_TYPE, SUB_GROUP_SIZE)
#define BLOCK_READ(ptr) intel_sub_group_block_read((const __global uint*)(ptr))
#define BLOCK_READ_8(ptr) intel_sub_group_block_read8((const __global uint*)(ptr))
#define BLOCK_READ(ptr) _sub_group_block_read((const __global uint*)(ptr))
#define BLOCK_READ_8(ptr) _sub_group_block_read8((const __global uint*)(ptr))
#define MMAD CAT(MMAD_, SUB_GROUP_SIZE)
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL(fully_connected_gpu_MMAD)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -133,8 +132,7 @@ KERNEL(fully_connected_gpu_MMAD)(
INPUT_PACKED_TYPE input_data[UNROLL_FACTOR];
FILTER_PACKED_TYPE_VEC weights_data[UNROLL_FACTOR];
__attribute__((opencl_unroll_hint))
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
unroll_for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, BLOCK_READ(input + input_idx + kb * MMAD_INPUT_FBLOCK_PITCH));
#if SUB_GROUP_SIZE == 8
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, BLOCK_READ_8(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
@@ -144,8 +142,7 @@ KERNEL(fully_connected_gpu_MMAD)(
#endif // SUB_GROUP_SIZE
}
__attribute__((opencl_unroll_hint))
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
unroll_for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
INPUT_PACKED_TYPE_VEC in;
in.s0 = sub_group_broadcast(input_data[kb], 0);
@@ -177,8 +174,7 @@ KERNEL(fully_connected_gpu_MMAD)(
barrier(CLK_LOCAL_MEM_FENCE);
if (feature_block == 0) {
__attribute__((opencl_unroll_hint))
for (uint i = 1; i < SLM_DIV_FACTOR; i++)
unroll_for(uint i = 1; i < SLM_DIV_FACTOR; i++)
dotProd += partial_summ[lid0 % feature_per_wg + i * feature_per_wg];
#endif // SLM_DIV_FACTOR > 1

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#if defined(__fc_f16)

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
// Required JIT constants:
@@ -18,7 +17,7 @@
#define ACC_TYPE float
__attribute__((intel_reqd_sub_group_size(16)))
REQD_SUB_GROUP_SIZE(16)
KERNEL (fully_connected_gpu_bf_io_input_spatial)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output,
@@ -47,8 +46,8 @@ KERNEL (fully_connected_gpu_bf_io_input_spatial)(
uint it_w_addr = _inG == UNIT_VAL_ZERO ? weight_idx_base : s_w_idx;
for (uint j = 0; j < 16; j++)
{
UNIT_TYPE _in = intel_sub_group_shuffle(_inG, j);
uint wi_w_addr = intel_sub_group_shuffle(it_w_addr, j);
UNIT_TYPE _in = _sub_group_shuffle(_inG, j);
uint wi_w_addr = _sub_group_shuffle(it_w_addr, j);
wi_w_addr += MULTIPLY_OFFSET(UNIT_TYPE, get_sub_group_local_id());
UNIT_TYPE _w = *OFFSET_GLOBAL_PTR(UNIT_TYPE, weight, wi_w_addr);
result += _in * _w;

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
// Required JIT constants:

View File

@@ -3,7 +3,9 @@
//
#include "include/batch_headers/common.cl"
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
// JIT Parameters:
// SIMD - sub-group size/simd width, one of {8, 16};
@@ -51,11 +53,6 @@
#define BIAS_BLOCK_READ(ptr, offset) BLOCK_READN(BIAS_TYPE, TILE_OFM, ptr, offset)
#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, TILE_OFM, ptr, offset, val)
// Utility math macros.
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// Check alignment restrictions for using block writes on output.
#define USE_BLOCK_WRITE ((OUTPUT_TYPE_SIZE * TILE_OUT_B_PITCH) % 16 == 0 && (OUTPUT_TYPE_SIZE * OUTPUT_OFFSET) % 16 == 0)
@@ -80,7 +77,7 @@
# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
#endif
__attribute__((intel_reqd_sub_group_size(SIMD)))
REQD_SUB_GROUP_SIZE(SIMD)
KERNEL(fc)(
const __global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
@@ -122,9 +119,8 @@ KERNEL(fc)(
INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % TILE_B * TILE_IN_B_PITCH];
MAKE_VECTOR_TYPE(FILTER_TYPE, TILE_OFM) tmp_wei = BLOCK_READN(FILTER_TYPE, TILE_OFM, weights, weights_offset);
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
acc[bi] = intel_sub_group_shuffle(tmp_input, bi) * tmp_wei;
unroll_for(uint bi = 0; bi < TILE_B; ++bi) {
acc[bi] = _sub_group_shuffle(tmp_input, bi) * tmp_wei;
}
weights_offset += TILE_OFM * SIMD;
@@ -148,19 +144,15 @@ KERNEL(fc)(
// NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
// but significantly degrades readability and generality of code.
// It doesn't also show noticable performance improvement on tested configurations.
__attribute__((opencl_unroll_hint))
for (uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
wei = FILTER_BLOCK_READ(weights, weights_offset);
weights_offset += TILE_K_OFM * SIMD;
__attribute__((opencl_unroll_hint))
for (uint kii = 0; kii < TILE_K; ++kii) {
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint kii = 0; kii < TILE_K; ++kii) {
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
const uint total_k = ki * TILE_K + kii;
INPUT0_TYPE in_val = intel_sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((FILTER_TYPE*)(&wei))[kii * TILE_OFM + fi];
}
}
@@ -181,20 +173,16 @@ KERNEL(fc)(
CONST_LOOP(TILE_B, LOAD_IN_0);
#undef LOAD_IN_0
input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B;
__attribute__((opencl_unroll_hint))
for (uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
unroll_for(uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
wei = FILTER_BLOCK_READ(weights, weights_offset);
weights_offset += TILE_K_OFM * SIMD;
__attribute__((opencl_unroll_hint))
for (uint kii = 0; kii < TILE_K; ++kii) {
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint kii = 0; kii < TILE_K; ++kii) {
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
const uint total_k = ki * TILE_K + kii;
if (total_k < LEFTOVER_IFM) {
INPUT0_TYPE in_val = intel_sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((FILTER_TYPE*)(&wei))[kii * TILE_OFM + fi];
}
}
@@ -216,24 +204,20 @@ KERNEL(fc)(
BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f);
#else
BIAS_VEC_TYPE bias = 0;
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
((BIAS_TYPE*)(&bias))[fi] = biases[out_f + sglid + fi * SIMD];
}
#endif
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
activated[bi] += TO_ACTIVATION_VEC_TYPE(bias);
}
#endif
OUTPUT_VEC_TYPE result[TILE_B] = { };
#if HAS_FUSED_OPS
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
#if TILE_OFM > 1
__attribute__((opencl_unroll_hint))
for (uint fi = 0; fi < TILE_OFM; ++fi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
FUSED_OPS_VEC;
result[bi][fi] = FUSED_OPS_RESULT_VEC;
}
@@ -243,8 +227,7 @@ KERNEL(fc)(
#endif // TILE_OFM > 1
}
#else
__attribute__((opencl_unroll_hint))
for (uint bi = 0; bi < TILE_B; ++bi) {
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
result[bi] = TO_OUTPUT_VEC_TYPE(ACTIVATION_TYPED(activated[bi], ACTIVATION_PARAMS_TYPED));
}
#endif
@@ -314,10 +297,6 @@ KERNEL(fc)(
#undef BIAS_BLOCK_READ
#undef OUTPUT_BLOCK_WRITE
#undef CEIL_DIV
#undef MIN
#undef MAX
#undef USE_BLOCK_WRITE
#undef MAIN_LOOP_ELEMENTS_COUNT

View File

@@ -2,7 +2,6 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/fetch_weights.cl"

View File

@@ -2,12 +2,13 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_16x8(_result, _blockA, _blockB) \
{ \
@@ -32,8 +33,8 @@
#define SUB_GROUP_SIZE 16
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv16_vload)(
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL (fully_connected_gpu_bs_f_bsv16_af8_vload)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output,
const __global UNIT_TYPE* weight

View File

@@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/sub_group_block_read.cl"
#include "include/batch_headers/sub_group_block_write.cl"
#include "include/batch_headers/sub_group_shuffle.cl"
#include "include/batch_headers/fetch_data.cl"
// ---------------------------------------------------------------------------------------------------------------------
@@ -93,7 +95,7 @@
// Extracts one scalar element of UNIT_TYPE from sub-group chunk;
// chunk - name of chunk variable, idx - 0-based index of element.
#define SG_UNIT_SELECT(chunk, idx) CHUNK_UNIT_SELECT(intel_sub_group_shuffle(chunk, (idx) / UNITS_PER_CHUNK), (idx) % UNITS_PER_CHUNK)
#define SG_UNIT_SELECT(chunk, idx) CHUNK_UNIT_SELECT(_sub_group_shuffle(chunk, (idx) / UNITS_PER_CHUNK), (idx) % UNITS_PER_CHUNK)
// ---------------------------------------------------------------------------------------------------------------------
// Reads / Writes:
@@ -118,10 +120,10 @@
(array)[(idx) + 6] = chunk_vec.s6, (array)[(idx) + 7] = chunk_vec.s7))
// Currently block read is 4 bytes aligned.
#define ALIGNED_BLOCK_READ1(ptr, byte_offset) intel_sub_group_block_read((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) intel_sub_group_block_read2((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ4(ptr, byte_offset) intel_sub_group_block_read4((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) intel_sub_group_block_read8((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ1(ptr, byte_offset) _sub_group_block_read((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ2(ptr, byte_offset) _sub_group_block_read2((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ4(ptr, byte_offset) _sub_group_block_read4((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) _sub_group_block_read8((const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
// Currently read is 4 bytes aligned.
#define ALIGNED_READ1(ptr, byte_offset) (*(const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
@@ -130,10 +132,10 @@
#define ALIGNED_READ8(ptr, byte_offset) vload8(0, (const __global CHUNK_TYPE*)((const __global char*)(ptr) + (byte_offset)))
// Currently block write is 16 bytes aligned.
#define ALIGNED_BLOCK_WRITE1(ptr, byte_offset, val) intel_sub_group_block_write((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE2(ptr, byte_offset, val) intel_sub_group_block_write2((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE4(ptr, byte_offset, val) intel_sub_group_block_write4((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) intel_sub_group_block_write8((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE1(ptr, byte_offset, val) _sub_group_block_write((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE2(ptr, byte_offset, val) _sub_group_block_write2((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE4(ptr, byte_offset, val) _sub_group_block_write4((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
#define ALIGNED_BLOCK_WRITE8(ptr, byte_offset, val) _sub_group_block_write8((__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)), (val))
// Currently block write is 4 bytes aligned.
#define ALIGNED_WRITE1(ptr, byte_offset, val) ((void)(*(__global CHUNK_TYPE*)((__global char*)(ptr) + (byte_offset)) = (val)))
@@ -156,7 +158,7 @@
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
const __global UNIT_TYPE* input,
@@ -210,32 +212,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE input_val[IN_CHUNK_PREFETCH_SIZE];
#if IN_CHUNK_PREFETCH_SIZE % 8 == 0
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 8)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 8)
{
CHUNK_VEC8_TYPE input_vals = ALIGNED_BLOCK_READ8(input, input_offset + 8 * sg_elem_offset);
input_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 4 == 0
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 4)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 4)
{
CHUNK_VEC4_TYPE input_vals = ALIGNED_BLOCK_READ4(input, input_offset + 4 * sg_elem_offset);
input_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 2 == 0
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 2)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 2)
{
CHUNK_VEC2_TYPE input_vals = ALIGNED_BLOCK_READ2(input, input_offset + 2 * sg_elem_offset);
input_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 1)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_SIZE; input_val_idx += 1)
{
CHUNK_VEC1_TYPE input_vals = ALIGNED_BLOCK_READ1(input, input_offset + sg_elem_offset);
input_offset += BYTES_PER_SG_READ;
@@ -243,8 +241,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
}
#endif
__attribute__((opencl_unroll_hint))
for (uint elem_base_idx = 0; elem_base_idx < IN_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
unroll_for(uint elem_base_idx = 0; elem_base_idx < IN_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
{
// Contains group of weights for RESPONSES_PER_SG_EXEC responses and for (FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC) spatial points.
// Currently for floats:
@@ -264,32 +261,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE filter_val[FILTER_CHUNK_PREFETCH_SIZE];
#if FILTER_CHUNK_PREFETCH_SIZE % 8 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 8)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 8)
{
CHUNK_VEC8_TYPE filter_vals = ALIGNED_BLOCK_READ8(weight, filter_offset + 8 * sg_elem_offset);
filter_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 4 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 4)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 4)
{
CHUNK_VEC4_TYPE filter_vals = ALIGNED_BLOCK_READ4(weight, filter_offset + 4 * sg_elem_offset);
filter_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 2 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 2)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 2)
{
CHUNK_VEC2_TYPE filter_vals = ALIGNED_BLOCK_READ2(weight, filter_offset + 2 * sg_elem_offset);
filter_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 1)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; filter_val_idx += 1)
{
CHUNK_VEC1_TYPE filter_vals = ALIGNED_BLOCK_READ1(weight, filter_offset + sg_elem_offset);
filter_offset += BYTES_PER_SG_READ;
@@ -298,8 +291,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
#endif
// Processing of cached filter chunks.
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; ++filter_val_idx)
unroll_for (uint filter_val_idx = 0; filter_val_idx < FILTER_CHUNK_PREFETCH_SIZE; ++filter_val_idx)
{
const uint input_base_elem_idx = elem_base_idx + filter_val_idx * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC;
@@ -338,32 +330,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE input_val[IN_CHUNK_PREFETCH_SIZE];
#if IN_CHUNK_PREFETCH_SIZE % 8 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 8 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 16)
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 8)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 8)
{
CHUNK_VEC8_TYPE input_vals = ALIGNED_BLOCK_READ8(input, input_offset + 8 * sg_elem_offset);
input_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 4 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 4 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 8)
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 4)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 4)
{
CHUNK_VEC4_TYPE input_vals = ALIGNED_BLOCK_READ4(input, input_offset + 4 * sg_elem_offset);
input_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#elif IN_CHUNK_PREFETCH_SIZE % 2 == 0 && (IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE % 2 == 0 || IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE >= 4)
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 2)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 2)
{
CHUNK_VEC2_TYPE input_vals = ALIGNED_BLOCK_READ2(input, input_offset + 2 * sg_elem_offset);
input_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(input_val, input_val_idx, input_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 1)
unroll_for(uint input_val_idx = 0; input_val_idx < IN_CHUNK_PREFETCH_REMAINDER_REQ_SIZE; input_val_idx += 1)
{
CHUNK_VEC1_TYPE input_vals = ALIGNED_BLOCK_READ1(input, input_offset + sg_elem_offset);
input_offset += BYTES_PER_SG_READ;
@@ -371,8 +359,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
}
#endif
__attribute__((opencl_unroll_hint))
for (uint elem_base_idx = 0; elem_base_idx < INPUT0_ELEMENTS_REMAINDER; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
unroll_for(uint elem_base_idx = 0; elem_base_idx < INPUT0_ELEMENTS_REMAINDER; elem_base_idx += FILTER_CHUNK_PREFETCH_SIZE * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC)
{
// Size of array of CHUNK_TYPE needed to contain filter elements for input elements in range [elem_base_idx; INPUT0_ELEMENTS_REMAINDER).
const uint filter_chunk_remainder_size = ((INPUT0_ELEMENTS_REMAINDER - elem_base_idx) * RESPONSES_PER_SG_EXEC + UNITS_PER_SG_READ - 1) / UNITS_PER_SG_READ;
@@ -381,32 +368,28 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
CHUNK_TYPE filter_val[FILTER_CHUNK_PREFETCH_SIZE];
#if FILTER_CHUNK_PREFETCH_SIZE % 8 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 8)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 8)
{
CHUNK_VEC8_TYPE filter_vals = ALIGNED_BLOCK_READ8(weight, filter_offset + 8 * sg_elem_offset);
filter_offset += 8 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC8_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 4 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 4)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 4)
{
CHUNK_VEC4_TYPE filter_vals = ALIGNED_BLOCK_READ4(weight, filter_offset + 4 * sg_elem_offset);
filter_offset += 4 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC4_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#elif FILTER_CHUNK_PREFETCH_SIZE % 2 == 0
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 2)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 2)
{
CHUNK_VEC2_TYPE filter_vals = ALIGNED_BLOCK_READ2(weight, filter_offset + 2 * sg_elem_offset);
filter_offset += 2 * BYTES_PER_SG_READ;
EXPAND_CHUNK_VEC2_TO_CHUNK_ARRAY(filter_val, filter_val_idx, filter_vals);
}
#else
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 1)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; filter_val_idx += 1)
{
CHUNK_VEC1_TYPE filter_vals = ALIGNED_BLOCK_READ1(weight, filter_offset + sg_elem_offset);
filter_offset += BYTES_PER_SG_READ;
@@ -415,8 +398,7 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
#endif
// Processing of cached filter chunks.
__attribute__((opencl_unroll_hint))
for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; ++filter_val_idx)
unroll_for (uint filter_val_idx = 0; filter_val_idx < filter_chunk_prefetch_req_size; ++filter_val_idx)
{
const uint input_base_elem_idx = elem_base_idx + filter_val_idx * UNITS_PER_SG_READ / RESPONSES_PER_SG_EXEC;
@@ -458,15 +440,14 @@ KERNEL (fully_connected_gpu_bx_bs_x_bsv16_b1)(
sg_reduce_offset < SUB_GROUP_SIZE;
sg_reduce_offset += SUB_GROUP_SIZE * RESPONSES_PER_SG_EXEC / UNITS_PER_SG_READ)
{
reduced_acc = AS_CHUNK(AS_UNITS(reduced_acc) + AS_UNITS(intel_sub_group_shuffle_down(acc, zero, sg_reduce_offset)));
reduced_acc = AS_CHUNK(AS_UNITS(reduced_acc) + AS_UNITS(_sub_group_shuffle_down(acc, zero, sg_reduce_offset)));
}
// Expand accumulator chunks to units.
const uint expanded_acc_size = (RESPONSES_PER_SG_EXEC + SUB_GROUP_SIZE - 1) / SUB_GROUP_SIZE;
__attribute__((opencl_unroll_hint))
for (uint expanded_acc_idx = 0; expanded_acc_idx < expanded_acc_size; ++expanded_acc_idx)
unroll_for (uint expanded_acc_idx = 0; expanded_acc_idx < expanded_acc_size; ++expanded_acc_idx)
{
const uint output_id = output_base_id + expanded_acc_idx * SUB_GROUP_SIZE;
#if BIAS_TERM

View File

@@ -2,13 +2,12 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
#if FP16_UNIT_USED
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(intel_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_half8(_sub_group_block_read_us8((const __global ushort*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
@@ -31,7 +30,7 @@
}
#else
// Block read - currently block is 4 bytes aligned.
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(intel_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define ALIGNED_BLOCK_READ8(ptr, byte_offset) as_float8(_sub_group_block_read8((const __global uint*)(ptr) + (byte_offset)))
#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB) \
{ \
@@ -57,7 +56,7 @@
#define SUB_GROUP_SIZE 8
__attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
KERNEL (fully_connected_gpu_xb_bs_xs_xsv8_bsv8_vload)(
const __global UNIT_TYPE* input,
__global UNIT_TYPE* output,

View File

@@ -1,64 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "include/batch_headers/data_types.cl"
#include "include/batch_headers/fetch_data.cl"
#include "include/sub_group.cl"
// Fully-connected primitive for float input in xb layout producing xb output.
// Each work-item accumulates NEURONS_PER_WORK_ITEM output neurons (in one or
// two float8 accumulators) for a single batch element. Requires a work-group
// of exactly 8 work-items (one sub-group) so each intel_sub_group_block_write8
// at the end stores 8 consecutive neurons' results.
__attribute__((reqd_work_group_size(8, 1, 1)))
KERNEL (fully_connected_gpu_xb_xb_b8_x8)(
const __global float* input,   // activations, xb layout: [INPUT0_ELEMENTS_COUNT x INPUT0_BATCH_NUM]
__global float* output,        // results, xb layout
const __global float* weight   // weights, stride FILTER_OFM_NUM per input element
#if BIAS_TERM
, __global UNIT_TYPE* bias)    // one bias value per output neuron (only when BIAS_TERM)
#else
)
#endif
{
const uint global_id = get_global_id(0);
const int x = get_global_id(0);
// Batch element handled by this work-item (input is batch-minor, xb layout).
const uint batch_id = x % INPUT0_BATCH_NUM;
// First output neuron index for this work-item's block of neurons.
uint neuronIdx = (x / INPUT0_BATCH_NUM) * NEURONS_PER_WORK_ITEM;
const uint sub_group_id = get_local_id(0);
const uint batch_num = INPUT0_BATCH_NUM;
// Offset of this work-item's first output value (xb layout: neuron-major,
// batch-minor, NEURONS_PER_WORK_ITEM neurons per work-item).
const int out_id = (global_id / batch_num) * NEURONS_PER_WORK_ITEM * batch_num + batch_id;
// NOTE(review): ofm_offset is computed but never used in this kernel body.
const int ofm_offset = (global_id * NEURONS_PER_WORK_ITEM) / batch_num;
// Accumulators: one float8 per group of 8 neurons.
float8 _data0 = 0.f;
#if NEURONS_PER_WORK_ITEM > 8
float8 _data1 = 0.f;
#endif
uint weight_offset = sub_group_id + neuronIdx;
// Multiply-accumulate over all input features. DOT_PRODUCT_8 is a project
// macro (presumably from include/sub_group.cl — confirm) that folds one
// input scalar against 8 weights into the float8 accumulator.
for (uint h = 0; h < INPUT0_ELEMENTS_COUNT; h++)
{
DOT_PRODUCT_8(_data0, input[h * batch_num + batch_id], weight[weight_offset])
#if NEURONS_PER_WORK_ITEM > 8
DOT_PRODUCT_8(_data1, input[h * batch_num + batch_id], weight[weight_offset + 8])
#endif
// Advance to the same neurons' weights for the next input element.
weight_offset += FILTER_OFM_NUM;
}
#if BIAS_TERM
// Add per-neuron bias to each accumulated block.
ADD_BIAS_8(_data0, bias[neuronIdx + sub_group_id]);
#if NEURONS_PER_WORK_ITEM > 8
ADD_BIAS_8(_data1, bias[neuronIdx + sub_group_id + 8]);
#endif
#endif
// Apply the fused activation before writing results out.
_data0 = ACTIVATION(_data0, ACTIVATION_PARAMS);
#if NEURONS_PER_WORK_ITEM > 8
_data1 = ACTIVATION(_data1, ACTIVATION_PARAMS);
#endif
// Sub-group block write: each call stores 8 neurons' float results
// (bit-cast to uint8 as required by the block-write builtin).
intel_sub_group_block_write8((__global uint*)output + out_id, as_uint8(_data0));
#if NEURONS_PER_WORK_ITEM > 8
intel_sub_group_block_write8((__global uint*)output + out_id + 8 * batch_num, as_uint8(_data1));
#endif
}

Some files were not shown because too many files have changed in this diff Show More