[VPU][OpenCL] Update custom kernels (#2131)
* [Custom CL] Updated OpenCL kernels and tests
* [Custom CL] Update OpenCL compiler
* Update firmware to 1365
* Disable ExpGenerateProposals tests
* VPU: new firmware no. 1370
* Myriad: re-enable ExpGenerateProposals tests

Co-authored-by: Maxim Kurin <maxim.kurin@intel.com>

parent 867340e8f1
commit 5ad4811793
@@ -19,8 +19,8 @@ set(VPU_SUPPORTED_FIRMWARES usb-ma2450 usb-ma2x8x pcie-ma248x)
# Default packages
#

set(FIRMWARE_PACKAGE_VERSION 1360)
set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.02.0")
set(FIRMWARE_PACKAGE_VERSION 1370)
set(VPU_CLC_MA2X8X_VERSION "movi-cltools-20.09.0")

#
# CMake variables to override default firmware files
@@ -65,9 +65,14 @@ void MathExpression::parse(const std::string& expression) {
        // parse number
        if (std::isdigit(*it)) {
            size_t len = 0;
            // parse number and use its length
            const auto value = std::stof(&*it, &len);
            (void) value;
            // copy sub string that represents a number
            auto substring = std::string{it, it + len};

            _parsedTokens.emplace_back(TokenType::Value, ValueType{value}, "");
            auto token = Token{TokenType::Value, ValueType{substring}, ""};
            _parsedTokens.push_back(std::move(token));

            std::advance(it, len - 1);
            continue;
@@ -84,6 +89,7 @@ void MathExpression::parse(const std::string& expression) {
            tokenStack.push(token);
            continue;
        }

        if (_vars.find(token) != _vars.end()) {
            _parsedTokens.emplace_back(TokenType::Value, ValueType{_vars.at(token)}, "");
            continue;
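For context on this hunk: the second argument to `std::stof` receives the number of characters consumed, which is what lets the parser slice the exact source text of the number back out of the expression. A stand-alone sketch of that mechanism (toy code, not the engine's API):

    #include <iostream>
    #include <string>

    int main() {
        const std::string expr = "3.5*x+1";
        auto it = expr.begin();                    // iterator at the start of a number
        size_t len = 0;
        const float value = std::stof(&*it, &len); // value == 3.5f, len == 3
        const std::string substring{it, it + len}; // "3.5" - the exact source text
        std::cout << value << " parsed from \"" << substring << "\"\n";
        return 0;
    }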
inference-engine/src/vpu/custom_kernels/binarization.cl (new file, 67 lines)
@@ -0,0 +1,67 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void binarization(
    const __global half *__restrict src_data,
    const __global half *__restrict input_low_high,
    const __global half *__restrict dst_data,
    int switch_out,
    int input_low_high_size,
    int W,
    int H)
{
    __local half local_src[15 * 1024];
    __local half local_dst[15 * 1024];

    event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
    wait_group_events(1, &e1);

    int c = get_global_id(2);
    int C = get_global_size(2);

    half dst_low = switch_out ? 1.h : -1.h;
    half dst_high = switch_out ? -1.h : 1.h;

    half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];

    for (int h = 0; h < H; h++) {

        __local const half *__restrict addr_src = local_src + h * W;
        __local half *__restrict addr_dst = local_dst + h * W;

#if 1
        for (int w = 0; w < W / 8; w++) {

            half8 h_src_val8 = (*((__local half8 *)addr_src + w));

            short8 cond1;
            cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
            cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
            cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
            cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
            cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
            cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
            cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
            cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);

            cond1 = ~(cond1 - (short8)1);

            short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);

            *((__local half8 *)addr_dst + w) = as_half8(res);
        }
#endif
        for (int w = W & (~0x7); w < W; w++) {
            addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
    wait_group_events(1, &e2);
}
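A note on the mask arithmetic in this kernel: the lane-wise comparisons yield 0 or 1, and `~(cond1 - 1)` widens that into a full 16-bit mask (1 becomes 0xFFFF, 0 becomes 0x0000), which is what makes the bitwise select between `dst_low` and `dst_high` branchless. A scalar sketch of the same trick, assuming 16-bit lanes:

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint16_t low = 0xBC00, high = 0x3C00; // half bit patterns of -1.0 and +1.0
        for (uint16_t cond = 0; cond <= 1; ++cond) {
            // cond == 1: ~(0) == 0xFFFF; cond == 0: ~(0xFFFF...) == 0x0000
            const uint16_t mask = (uint16_t)~(cond - 1u);
            const uint16_t res = (uint16_t)((mask & low) | (~mask & high));
            assert(res == (cond ? low : high)); // branchless select matches the ternary
        }
        return 0;
    }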
@@ -0,0 +1,95 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }

__kernel void binary_convolution(
    const __global half *restrict src_data,
    const __global uchar *restrict weights_data,
    __global half *restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,

    int DW,
    int DH,

    int GC,

    int KW,
    int KH,

    int PW,
    int PH,

    int SW,
    int SH)
{
    int ipad_value = ((pad_value > 0.f) ? 1 : 0);
    int c = get_global_id(2);
    int y = get_global_id(1);
    int x = get_global_id(0);

    int OC = get_global_size(2);
    int OH = get_global_size(1);
    int OW = get_global_size(0);

    int KD = 1;
    int SD = 0;
    int DD = 0;
    int PD = 0;
    int ID = 1;
    int OD = 1;

    int nbits = 8;

    int g = c % GC;
    int oc = c / GC;
    int oh = y;
    int ow = x;

    for (int od = 0; od < OD; od++) {
        int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow;

        int res = 0;

        for (int ic = 0; ic < IC / GC; ic++) {
            for (int kd = 0; kd < KD; kd++) {
                for (int kh = 0; kh < KH; kh++) {
                    for (int kw = 0; kw < KW; kw++) {
                        int widx = g * OC / GC * IC / GC * KD * KH * KW
                                   + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW
                                   + kh * KW + kw;

                        int w = extract_weights(weights_data[widx / nbits], (widx % nbits));

                        int s;

                        int iw = ow * SW - PW + kw * DW;
                        int ih = oh * SH - PH + kh * DH;
                        int id = od * SD - PD + kd * DD;

                        if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0
                            || id >= (int)ID) {
                            s = ipad_value;
                        } else {
                            int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW
                                       + ih * IW + iw;

                            s = ((src_data[iidx] > 0.f) ? 1 : 0);
                        }

                        res += s ^ w;
                    }
                }
            }
        }

        dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res);
    }
}
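The write-back line encodes the standard binary-convolution identity: with activations and weights mapped to +/-1, every tap contributes +1 on a sign match and -1 on a mismatch, so with K = IC/GC * KD * KH * KW taps and `res` mismatches the dot product equals K - 2 * res. A worked check in plain C++ (a sketch of the arithmetic, not the production path):

    #include <cassert>

    int main() {
        const int K = 4;
        const int s[K] = {1, 0, 1, 1}; // activation sign bits (1 = positive)
        const int w[K] = {1, 1, 0, 1}; // weight bits
        int res = 0, dot = 0;
        for (int i = 0; i < K; ++i) {
            res += s[i] ^ w[i];                       // count sign mismatches
            dot += (s[i] ? 1 : -1) * (w[i] ? 1 : -1); // the +/-1 dot product
        }
        assert(dot == K - 2 * res); // here: 0 == 4 - 2 * 2
        return 0;
    }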
@@ -3,186 +3,115 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

ushort extract_weights(uchar val, int bit)
{
    return ((val >> bit) & 1);
}
ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }

__kernel void binary_convolution(
    const __global half* restrict src_data,
    const __global uchar* restrict weights_data,
    const __global half* restrict dst_data,
    float pad_value,
    const __global half *restrict src_data,
    const __global uchar *restrict weights_data,
    __global half *restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,
    int IW,
    int IH,
    int IC,

    int DW,
    int DH,
    int DW,
    int DH,

    int GC,
    int GC,

    int KW,
    int KH,
    int KW,
    int KH,

    int PW,
    int PH,
    int PW,
    int PH,

    int SW,
    int SH,
    int SW,
    int SH,

    int OW,
    const __local half* restrict src_local,
    __local half* restrict dst_local)
    int OW)
{
    int oh = get_global_id(0);
    int oc = get_global_id(1);
    int OH = get_global_size(0);
    int OC = get_global_size(1);
    __local half src_local[32 * 1024];
    __local half dst_local[2 * 1024];

    const int oh = get_group_id(0);
    const int oc = get_group_id(1);
    const int OH = get_global_size(0);
    const int OC = get_global_size(1);

    const int gc = oc / (OC / GC);

    if (oh * SH >= 0 && oh * SH <= IH - 1) {
        const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;

        event_t e1 = async_work_group_copy_2D2D(
            src_local, // dst
            src, // src
            IW, // num_elements_per_line,
            IC / GC, // num_lines,
            IH * IW - IW, // src_line_stride,
            0, // dst_line_stride,
            0);
        wait_group_events(1, &e1);
    }

    half pad_value_half = convert_half(pad_value);

    //padding row
    if (oh * SH > IH - 1)
    {
        __local half* dst = src_local;
        for(int c = 0; c < IC/GC; c++)
        {
    if (oh * SH > IH - 1) {
        __local half *dst = src_local;
        for (int c = 0; c < IC / GC; c++) {
#pragma unroll 8
            for(int j = 0; j < IW; j++)
            {
            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += IW;
        }
    }

    }

    int OWS = SW * OW;
    ushort8 in;

    for (int ows8 = 0; ows8 < (OWS+7)/8; ows8++)
    {
    for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
        ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
        for (int ic = 0; ic < IC/GC; ++ic)
        {
            __local half* src = (__local half*)((__local half8*)(src_local + ic * IW) + ows8);
            int weight_pos = oc * IC/GC + ic;
            ushort w = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
        for (int ic = 0; ic < IC / GC; ++ic) {
            __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
            int weight_pos = oc * IC / GC + ic;
            ushort w =
                extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));

            if ((ows8 * 8) <= IW - 1)
            {
                in = *((__local ushort8*)(src));
            if ((ows8 * 8) <= IW - 1) {
                in = *((__local ushort8 *)(src));
            }

            //padding column
            if (ows8 * 8 + 7 > IW - 1)
            {
            if (ows8 * 8 + 7 > IW - 1) {
                int boundary = (IW - 1) - ows8 * 8 + 1;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++)
                {
                    *((half*)(&in) + offset) = pad_value_half;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++) {
                    *((half *)(&in) + offset) = pad_value_half;
                }
            }

            ushort8 w8 = (ushort8)(w);

            ushort8 cond = (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);

            ushort8 cond =
                (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);

            val += (cond ^ w8);
        }

        }

        ushort8 val_shift = val << 1;
        int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++)
        {
            *(dst_local + ow) = (half)(IC/GC - *((ushort*)(&val_shift) + ow * SW - ows8 * 8));
        int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
            *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
    wait_group_events(1, &e2);
}

__kernel void __dma_preload_binary_convolution(
    const __global half* restrict src_data,
    const __global uchar* restrict weights_data,
    const __global half* restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,

    int DW,
    int DH,

    int GC,

    int KW,
    int KH,

    int PW,
    int PH,

    int SW,
    int SH,

    int OW,
    __local half* restrict src_local,
    const __local half* restrict dst_local)
{
    const int oh = get_group_id(0);
    const int oc = get_group_id(1);
    const int OC = get_global_size(1);

    const int gc = oc / (OC/GC);

    if (oh * SH >= 0 && oh * SH <= IH - 1)
    {
        const __global half* src = src_data + (gc * IC/GC) * IW * IH + (SH * oh) * IW;
        WorkGroupDmaCreateStrideTransaction(
            src, // src
            src_local, // dst
            IW * sizeof(half), // src width
            IW * sizeof(half), // dst width
            IH * IW * sizeof(half), // src stride
            IW * sizeof(half), // dst stride
            IW * IC/GC * sizeof(half), //total size
            0
            );
    }
}
__kernel void __dma_postwrite_binary_convolution(
    const __global half* restrict src_data,
    const __global uchar* restrict weights_data,
    __global half* restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,

    int DW,
    int DH,

    int GC,

    int KW,
    int KH,

    int PW,
    int PH,

    int SW,
    int SH,

    int OW,
    const __local half* restrict src_local,
    const __local half* restrict dst_local)
{
    const int oh = get_group_id(0);
    const int oc = get_group_id(1);
    const int OH = get_global_size(0);

    async_work_group_copy(dst_data + oc*OW*OH + oh*OW, dst_local, OW, 0);
}
@@ -3,82 +3,131 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

ushort extract_weights(uchar val, int bit)
{
    return ((val >> bit) & 1);
}
ushort extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }

__kernel void binary_convolution(
    const __global half* restrict src_data,
    const __global uchar* restrict weights_data,
    const __global half* restrict dst_data,
    float pad_value,
    const __global half *restrict src_data,
    const __global uchar *restrict weights_data,
    const __global half *restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,
    int IW,
    int IH,
    int IC,

    int DW,
    int DH,
    int DW,
    int DH,

    int GC,
    int GC,

    int KW,
    int KH,
    int KW,
    int KH,

    int PW,
    int PH,
    int PW,
    int PH,

    int SW,
    int SH,
    int SW,
    int SH,

    int OW,
    const __local half* restrict src_local,
    __local half* restrict dst_local)
    int OW)
{
    int oh = get_global_id(0);
    int oc = get_global_id(1);
    int OH = get_global_size(0);
    int OC = get_global_size(1);
    __local half src_local[32 * 1024];
    __local half dst_local[2 * 1024];

    const int oh = get_group_id(0);
    const int oc = get_group_id(1);
    const int OH = get_global_size(0);
    const int OC = get_global_size(1);

    const int gc = oc / (OC / GC);

    if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
    {
        event_t e = async_work_group_copy_3D3D(
            src_local, // dst
            src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
            IW, // num_elements_per_line
            3, // num_lines
            DH * IW - IW, // src_line_stride
            0, // dst_line_stride
            IC / GC, // num planes
            IH * IW - 3 * IW, // src plane stride
            0, // dst plane stride
            0);
        wait_group_events(1, &e);
    } else {
        int ih = oh * SH - 1;
        if (ih >= 0 && ih <= IH - 1) //dma for first row
        {
            event_t e = async_work_group_copy_2D2D(
                src_local, // dst
                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
                IW, // num_elements_per_line,
                IC / GC, // num_lines,
                IH * IW - IW, // src_line_stride,
                2 * IW, // dst_line_stride,
                0);

            wait_group_events(1, &e);
        }
        ih = oh * SH - 1 + DH;
        if (ih >= 0 && ih <= IH - 1) //dma for second row
        {
            event_t e = async_work_group_copy_2D2D(
                src_local + IW, // dst
                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
                IW, // num_elements_per_line,
                IC / GC, // num_lines,
                IH * IW - IW, // src_line_stride,
                2 * IW, // dst_line_stride,
                0);
            wait_group_events(1, &e);
        }
        ih = oh * SH - 1 + 2 * DH;
        if (ih >= 0 && ih <= IH - 1) //dma for third row
        {
            event_t e = async_work_group_copy_2D2D(
                src_local + 2 * IW, // dst
                src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
                IW, // num_elements_per_line,
                IC / GC, // num_lines,
                IH * IW - IW, // src_line_stride,
                2 * IW, // dst_line_stride,
                0);
            wait_group_events(1, &e);
        }
    }

    half pad_value_half = convert_half(pad_value);

    //padding row
    if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1)
    {
        __local half* dst = src_local;
        for(int c = 0; c < IC/GC; c++)
        {
    if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
        __local half *dst = src_local;
        for (int c = 0; c < IC / GC; c++) {
#pragma unroll 8
            for(int j = 0; j < IW; j++)
            {
            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += 3 * IW;
        }
    }
    if (oh * SH + DH - 1 > IH - 1)
    {
        __local half* dst = src_local + IW;
        for(int c = 0; c < IC/GC; c++)
        {
    if (oh * SH + DH - 1 > IH - 1) {
        __local half *dst = src_local + IW;
        for (int c = 0; c < IC / GC; c++) {
#pragma unroll 8
            for(int j = 0; j < IW; j++)
            {
            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += 3 * IW;
        }
    }
    if (oh * SH + DH + DH - 1 > IH - 1)
    {
        __local half* dst = src_local + 2 * IW;
        for(int c = 0; c < IC/GC; c++)
        {
    if (oh * SH + DH + DH - 1 > IH - 1) {
        __local half *dst = src_local + 2 * IW;
        for (int c = 0; c < IC / GC; c++) {
#pragma unroll 8
            for(int j = 0; j < IW; j++)
            {
            for (int j = 0; j < IW; j++) {
                dst[j] = pad_value_half;
            }
            dst += 3 * IW;
@@ -97,13 +146,12 @@ __kernel void binary_convolution(
    ushort8 in21;
    ushort8 in22;

    for (int ows8 = 0; ows8 < (OWS+7)/8; ows8++)
    {
    for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
        ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
        for (int ic = 0; ic < IC/GC; ++ic)
        {
            __local half* src = (__local half*)((__local half8*)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
            int weight_pos = oc*IC/GC*3*3 + ic*3*3;
        for (int ic = 0; ic < IC / GC; ++ic) {
            __local half *src =
                (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
            int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3;
            ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
            ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
            ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
@@ -114,64 +162,55 @@ __kernel void binary_convolution(
            ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
            ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));

            if ((ows8 * 8) - 1 <= IW - 1)
            {
                in00 = *((__local ushort8*)(src - IW - DW));
                in01 = *((__local ushort8*)(src - IW));
                in02 = *((__local ushort8*)(src - IW + DW));
            if ((ows8 * 8) - 1 <= IW - 1) {
                in00 = *((__local ushort8 *)(src - IW - DW));
                in01 = *((__local ushort8 *)(src - IW));
                in02 = *((__local ushort8 *)(src - IW + DW));

                in10 = *((__local ushort8*)(src - DW));
                in11 = *((__local ushort8*)(src));
                in12 = *((__local ushort8*)(src + DW));
                in10 = *((__local ushort8 *)(src - DW));
                in11 = *((__local ushort8 *)(src));
                in12 = *((__local ushort8 *)(src + DW));

                in20 = *((__local ushort8*)(src + IW - DW));
                in21 = *((__local ushort8*)(src + IW));
                in22 = *((__local ushort8*)(src + IW + DW));
                in20 = *((__local ushort8 *)(src + IW - DW));
                in21 = *((__local ushort8 *)(src + IW));
                in22 = *((__local ushort8 *)(src + IW + DW));
            }

            //padding column
            if (ows8 * 8 - 1 < 0)
            {
            if (ows8 * 8 - 1 < 0) {
                int boundary = 1 - ows8 * 8;
                boundary = boundary > 8 ? 8 : boundary;
                for (int offset = 0; offset < boundary; offset++)
                {
                    *((half*)(&in00) + offset) = pad_value_half;
                    *((half*)(&in10) + offset) = pad_value_half;
                    *((half*)(&in20) + offset) = pad_value_half;
                }
            }
            if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1)
            {
                int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++)
                {
                    *((half*)(&in02) + offset) = pad_value_half;
                    *((half*)(&in12) + offset) = pad_value_half;
                    *((half*)(&in22) + offset) = pad_value_half;
                }
            }
            if ((ows8 * 8 + 7) + DW - 1 > IW - 1)
            {
                int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++)
                {
                    *((half*)(&in01) + offset) = pad_value_half;
                    *((half*)(&in11) + offset) = pad_value_half;
                    *((half*)(&in21) + offset) = pad_value_half;
                boundary = boundary > 8 ? 8 : boundary;
                for (int offset = 0; offset < boundary; offset++) {
                    *((half *)(&in00) + offset) = pad_value_half;
                    *((half *)(&in10) + offset) = pad_value_half;
                    *((half *)(&in20) + offset) = pad_value_half;
                }
            }
            if ((ows8 * 8 + 7) - 1 > IW - 1)
            {
            if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
                int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++) {
                    *((half *)(&in02) + offset) = pad_value_half;
                    *((half *)(&in12) + offset) = pad_value_half;
                    *((half *)(&in22) + offset) = pad_value_half;
                }
            }
            if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
                int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++) {
                    *((half *)(&in01) + offset) = pad_value_half;
                    *((half *)(&in11) + offset) = pad_value_half;
                    *((half *)(&in21) + offset) = pad_value_half;
                }
            }
            if ((ows8 * 8 + 7) - 1 > IW - 1) {
                int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++)
                {
                    *((half*)(&in00) + offset) = pad_value_half;
                    *((half*)(&in10) + offset) = pad_value_half;
                    *((half*)(&in20) + offset) = pad_value_half;
                boundary = boundary < 0 ? 0 : boundary;
                for (int offset = boundary; offset < 8; offset++) {
                    *((half *)(&in00) + offset) = pad_value_half;
                    *((half *)(&in10) + offset) = pad_value_half;
                    *((half *)(&in20) + offset) = pad_value_half;
                }
            }

@@ -185,16 +224,34 @@ __kernel void binary_convolution(
            ushort8 w21 = (ushort8)(w7);
            ushort8 w22 = (ushort8)(w8);

            ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
            ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);

            ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);
            ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
                (ushort8)(1) :
                (ushort8)(0);

            val += (cond0 ^ w00);
            val += (cond1 ^ w01);
            val += (cond2 ^ w02);
@@ -207,150 +264,15 @@ __kernel void binary_convolution(
        }

        ushort8 val_shift = val << 1;
        int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++)
        {
            *(dst_local + ow) = (half)(IC/GC*KH*KW - *((ushort*)(&val_shift) + ow * SW - ows8 * 8));
        int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
        for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
            *(dst_local + ow) =
                (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
    wait_group_events(1, &e2);
}

__kernel void __dma_preload_binary_convolution(
    const __global half* restrict src_data,
    const __global uchar* restrict weights_data,
    const __global half* restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,

    int DW,
    int DH,

    int GC,

    int KW,
    int KH,

    int PW,
    int PH,

    int SW,
    int SH,

    int OW,
    __local half* restrict src_local,
    const __local half* restrict dst_local)
{
    const int oh = get_group_id(0);
    const int oc = get_group_id(1);
    const int OH = get_global_size(0);
    const int OC = get_global_size(1);

    const int gc = oc / (OC/GC);

    if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
    {
        const __global half* src = src_data + (gc * IC/GC) * IW * IH + (SH * oh - 1) * IW;
        WorkGroupDmaCreate3DTransaction(
            src, //src,
            src_local, //dst,
            IW * sizeof(half), //src width,
            IW * sizeof(half), //dst width,
            DH * IW * sizeof(half), //src stride,
            IW * sizeof(half), //dst stride,
            IC/GC, //num planes //hang when > 256
            IH * IW * sizeof(half), //src plane stride,
            3 * IW * sizeof(half), //dst plane stride,
            3 * IW * sizeof(half), //plane size,
            0
        );

    }
    else
    {
        int ih = oh * SH - 1;
        if (ih >= 0 && ih <= IH - 1) //dma for first row
        {
            const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW;
            __local half* dst = src_local;
            WorkGroupDmaCreateStrideTransaction(
                src, // src
                dst, // dst
                IW * sizeof(half), // src width
                IW * sizeof(half), // dst width
                IH * IW * sizeof(half), // src stride
                3 * IW * sizeof(half), // dst stride
                IW * IC/GC * sizeof(half), //total size
                0
            );
        }
        ih = oh * SH - 1 + DH;
        if (ih >= 0 && ih <= IH - 1) //dma for second row
        {
            const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW;
            __local half* dst = src_local + IW;
            WorkGroupDmaCreateStrideTransaction(
                src, // src
                dst, // dst
                IW * sizeof(half), // src width
                IW * sizeof(half), // dst width
                IH * IW * sizeof(half), // src stride
                3 * IW * sizeof(half), // dst stride
                IW * IC/GC * sizeof(half), //total size
                0
            );
        }
        ih = oh * SH - 1 + 2 * DH;
        if (ih >= 0 && ih <= IH - 1) //dma for third row
        {
            const __global half* src = src_data + (gc * IC/GC) * IW * IH + ih * IW;
            __local half* dst = src_local + 2 * IW;
            WorkGroupDmaCreateStrideTransaction(
                src, // src
                dst, // dst
                IW * sizeof(half), // src width
                IW * sizeof(half), // dst width
                IH * IW * sizeof(half), // src stride
                3 * IW * sizeof(half), // dst stride
                IW * IC/GC * sizeof(half), //total size
                0
            );
        }
    }
}
__kernel void __dma_postwrite_binary_convolution(
    const __global half* restrict src_data,
    const __global uchar* restrict weights_data,
    __global half* restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,

    int DW,
    int DH,

    int GC,

    int KW,
    int KH,

    int PW,
    int PH,

    int SW,
    int SH,

    int OW,
    const __local half* restrict src_local,
    const __local half* restrict dst_local)
{
    const int oh = get_group_id(0);
    const int oc = get_group_id(1);
    const int OH = get_global_size(0);

    async_work_group_copy(dst_data + oc*OW*OH + oh*OW, dst_local, OW, 0);
}
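Across these kernels the shape of the rewrite is the same: the `__local` staging buffers are now declared inside the kernel and filled with `async_work_group_copy*` calls at the top, with a `barrier(CLK_LOCAL_MEM_FENCE)` and a copy-out at the bottom, replacing the separate `__dma_preload_*` / `__dma_postwrite_*` entry points that previously received `src_local`/`dst_local` from the runtime. A minimal sketch of the new pattern (a toy kernel, not one of the shipped ones):

    #pragma OPENCL EXTENSION cl_khr_fp16 : enable

    __kernel void copy_and_scale_row(const __global half *src, __global half *dst, int W)
    {
        __local half row[1024];

        // stage one row of the input into local memory
        event_t e1 = async_work_group_copy(row, src + get_group_id(0) * W, W, 0);
        wait_group_events(1, &e1);

        // compute entirely in local memory
        for (int w = get_local_id(0); w < W; w += get_local_size(0)) {
            row[w] = row[w] * 2.h;
        }

        barrier(CLK_LOCAL_MEM_FENCE); // make local writes visible before the copy-out

        event_t e2 = async_work_group_copy(dst + get_group_id(0) * W, row, W, 0);
        wait_group_events(1, &e2);
    }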
@@ -1,339 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

int extract_weights(uchar val, int bit) {
    return ((val >> bit) & 1);
}

__kernel void binary_convolution(const __global half* restrict src_data,
                                 const __global uchar* restrict weights_data,
                                 __global half* restrict dst_data,
                                 float pad_value,

                                 int IW,
                                 int IH,
                                 int IC,

                                 int DW,
                                 int DH,

                                 int GC,

                                 int KW,
                                 int KH,

                                 int PW,
                                 int PH,

                                 int SW,
                                 int SH)
{
    int ipad_value = ((pad_value > 0.f) ? 1 : 0);
    int c = get_global_id(2);
    int y = get_global_id(1);
    int x = get_global_id(0);

    int OC = get_global_size(2);
    int OH = get_global_size(1);
    int OW = get_global_size(0);

    int KD = 1;
    int SD = 0;
    int DD = 0;
    int PD = 0;
    int ID = 1;
    int OD = 1;

    int nbits = 8;

    int g = c % GC;
    int oc = c / GC;
    int oh = y;
    int ow = x;

    for (int od = 0; od < OD; od++) {
        int oidx = g * OC / GC * OD * OH * OW
                   + oc * OD * OH * OW
                   + od * OH * OW
                   + oh * OW
                   + ow;

        int res = 0;

        for (int ic = 0; ic < IC / GC; ic++) {
            for (int kd = 0; kd < KD; kd++) {
                for (int kh = 0; kh < KH; kh++) {
                    for (int kw = 0; kw < KW; kw++) {
                        int widx = g * OC / GC * IC / GC * KD * KH * KW
                                   + oc * IC / GC * KD * KH * KW
                                   + ic * KD * KH * KW
                                   + kd * KH * KW
                                   + kh * KW
                                   + kw;

                        int w = extract_weights(weights_data[widx/nbits], (widx % nbits));

                        int s;

                        int iw = ow * SW - PW + kw * DW;
                        int ih = oh * SH - PH + kh * DH;
                        int id = od * SD - PD + kd * DD;

                        if (iw < 0 || iw >= (int) IW ||
                            ih < 0 || ih >= (int) IH ||
                            id < 0 || id >= (int) ID) {
                            s = ipad_value;
                        } else {
                            int iidx = g * IC / GC * ID * IH * IW
                                       + ic * ID * IH * IW
                                       + id * IH * IW
                                       + ih * IW
                                       + iw;

                            s = ((src_data[iidx] > 0.f) ? 1 : 0);
                        }

                        res += s ^ w;
                    }
                }
            }
        }

        dst_data[oidx] = (half)(IC/GC*KD*KH*KW - 2*res);
    }
}

__kernel void quantize(const __global half* __restrict src,
                       const __global half* __restrict input_low,
                       const __global half* __restrict input_high,
                       const __global half* __restrict output_low,
                       const __global half* __restrict output_high,
                       const __global half* __restrict dst,
                       int levels,
                       int input_low_size,
                       int input_high_size,
                       int output_low_size,
                       int output_high_size,
                       int W,
                       int H,
                       const __local half* __restrict src_local,
                       __local half* __restrict dst_local)
{

    int c = get_global_id(2);
    int C = get_global_size(2);

    half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]);
    half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]);
    half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]);
    half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);

    half const1 = (half)(0.01 > (h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
    half const2 = (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));

    for (int h = 0; h < H; h++)
    {
        __local const half* __restrict addr_src = src_local + h*W;
        __local half* __restrict addr_dst = dst_local + h*W;

        for (int w = 0; w < W / 8; w++)
        {
            half8 val = *((__local half8*)addr_src + w);
#if 1
            // round is too slow =( 902 b of code
            //half8 aux = round((val - (half8)h_ilow) * (half8)const1);

            half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;

            aux = (half8){
                (half)(short)(aux.s0),
                (half)(short)(aux.s1),
                (half)(short)(aux.s2),
                (half)(short)(aux.s3),
                (half)(short)(aux.s4),
                (half)(short)(aux.s5),
                (half)(short)(aux.s6),
                (half)(short)(aux.s7)
            };

            aux = aux * (half8)const2 + (half8)h_olow;

            // vector comparison add 756 b of assembly, so do in manually
            // short8 a = val <= (half8)h_olow;
            // short8 b = val > (half8)h_ohigh;

            short8 a;
            short8 b;
            a.s0 = (val.s0 <= h_ilow);
            a.s1 = (val.s1 <= h_ilow);
            a.s2 = (val.s2 <= h_ilow);
            a.s3 = (val.s3 <= h_ilow);
            a.s4 = (val.s4 <= h_ilow);
            a.s5 = (val.s5 <= h_ilow);
            a.s6 = (val.s6 <= h_ilow);
            a.s7 = (val.s7 <= h_ilow);

            b.s0 = (val.s0 > h_ihigh);
            b.s1 = (val.s1 > h_ihigh);
            b.s2 = (val.s2 > h_ihigh);
            b.s3 = (val.s3 > h_ihigh);
            b.s4 = (val.s4 > h_ihigh);
            b.s5 = (val.s5 > h_ihigh);
            b.s6 = (val.s6 > h_ihigh);
            b.s7 = (val.s7 > h_ihigh);

            a = ~(a-(short8)1);
            b = ~(b-(short8)1);

            short8 c1 = (~a & b);
            short8 c2 = (~a & ~b);

            short8 res = a & as_short8((half8)h_olow)
                       | c1 & as_short8((half8)h_ohigh)
                       | c2 & as_short8(aux);

            *((__local half8*)addr_dst + w) = as_half8(res);
#else
            *((__local half8*)addr_dst + w) = val;
#endif
        }
        for (int w = W & (~0x7); w < W; w++)
        {
            half val = addr_src[w];
#if 1
            short a = val <= h_ilow; a = ~(a-1);
            short b = val > h_ihigh; b = ~(b-1);

            short c1 = (~a & b);
            short c2 = (~a & ~b);

            short res = a & as_short(h_olow)
                      | c1 & as_short(h_ohigh)
                      | c2 & as_short(((half)(round( (val - h_ilow) * const1) * const2) + h_olow));

            addr_dst[w] = as_half(res);
#else
            addr_dst[w] = val;
#endif
        }
    }
}
__kernel void __dma_preload_quantize(const __global half* __restrict src,
                                     const __global half* __restrict input_low,
                                     const __global half* __restrict input_high,
                                     const __global half* __restrict output_low,
                                     const __global half* __restrict output_high,
                                     const __global half* __restrict dst,
                                     int levels,
                                     int input_low_size,
                                     int input_high_size,
                                     int output_low_size,
                                     int output_high_size,
                                     int W,
                                     int H,
                                     __local half* __restrict src_local,
                                     const __local half* __restrict dst_local)
{
    const int sizePlane = W*H;
    async_work_group_copy(src_local ,src + get_group_id(2)*sizePlane, sizePlane, 0);
}
__kernel void __dma_postwrite_quantize(const __global half* __restrict src,
                                       const __global half* __restrict input_low,
                                       const __global half* __restrict input_high,
                                       const __global half* __restrict output_low,
                                       const __global half* __restrict output_high,
                                       __global half* __restrict dst,
                                       int levels,
                                       int input_low_size,
                                       int input_high_size,
                                       int output_low_size,
                                       int output_high_size,
                                       int W,
                                       int H,
                                       const __local half* __restrict src_local,
                                       const __local half* __restrict dst_local)
{
    const int sizePlane = W*H;
    async_work_group_copy(dst + get_group_id(2)*sizePlane ,dst_local, sizePlane, 0);
}

__kernel void binarization(const __global half* __restrict src,
                           const __global half* __restrict input_low_high,
                           const __global half* __restrict dst,
                           int switch_out,
                           int input_low_high_size,
                           int W,
                           int H,
                           const __local half* __restrict src_local,
                           __local half* __restrict dst_local)
{
    int c = get_global_id(2);
    int C = get_global_size(2);

    half dst_low = switch_out ? 1.h : -1.h;
    half dst_high = switch_out ? -1.h : 1.h;

    half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];

    for (int h = 0; h < H; h++) {

        __local const half* __restrict addr_src = src_local + h*W;
        __local half* __restrict addr_dst = dst_local + h*W;

#if 1
        for (int w = 0; w < W / 8; w++) {

            half8 h_src_val8 = (*((__local half8*)addr_src + w));

            short8 cond1;
            cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
            cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
            cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
            cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
            cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
            cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
            cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
            cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);

            cond1 = ~(cond1-(short8)1);

            short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);

            *((__local half8*)addr_dst + w) = as_half8(res);
        }
#endif
        for (int w = W & (~0x7); w < W; w++)
        {
            addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
        }
    }
}
__kernel void __dma_preload_binarization(const __global half* __restrict src,
                                         const __global half* __restrict input_low_high,
                                         const __global half* __restrict dst,
                                         int switch_out,
                                         int input_low_high_size,
                                         int W,
                                         int H,
                                         __local half* __restrict src_local,
                                         const __local half* __restrict dst_local)
{
    const int sizePlane = W*H;
    async_work_group_copy(src_local ,src + get_group_id(2)*sizePlane, sizePlane, 0);
}
__kernel void __dma_postwrite_binarization(const __global half* __restrict src,
                                           const __global half* __restrict input_low_high,
                                           __global half* __restrict dst,
                                           int switch_out,
                                           int input_low_high_size,
                                           int W,
                                           int H,
                                           const __local half* __restrict src_local,
                                           const __local half* __restrict dst_local)
{
    const int sizePlane = W*H;
    async_work_group_copy(dst + get_group_id(2)*sizePlane ,dst_local, sizePlane, 0);
}
@@ -1,281 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void Convolution1x1_NCHW(
    const __global half* in,
    const __global half* out,
    const __global half* w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC,
    const __local half* in_local,
    __local half* out_local)
{
    int oh = get_global_id(0);
    int oc = get_global_id(1);

    int stride;
    int write_output = 0;
    __global half* src;

    __global half8* w8 = (__global half8*)(&w[oc*IC]);
    __global half* w1 = (__global half*)(&w[oc*IC]);

    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8)
    {
        uint iw = ow;
        uint ih = oh;

        half8 val8_0 = 0.0f;

        __local half8* in8_0 = (__local half8*)(&in_local[iw + 0 * IW]);
        __local half8* in8_1 = (__local half8*)(&in_local[iw + 1 * IW]);
        __local half8* in8_2 = (__local half8*)(&in_local[iw + 2 * IW]);
        __local half8* in8_3 = (__local half8*)(&in_local[iw + 3 * IW]);
        __local half8* in8_4 = (__local half8*)(&in_local[iw + 4 * IW]);
        __local half8* in8_5 = (__local half8*)(&in_local[iw + 5 * IW]);
        __local half8* in8_6 = (__local half8*)(&in_local[iw + 6 * IW]);
        __local half8* in8_7 = (__local half8*)(&in_local[iw + 7 * IW]);

        for (uint ic = 0; ic < IC / 8; ic ++)
        {
            val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
            val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
            val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
            val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
            val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
            val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
            val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
            val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
        }

        for (uint ic = (IC & (~0x7)); ic < IC; ++ic)
        {
            val8_0 += *((__local half8*)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
        }
        *((__local half8*)&out_local[ow + 0]) = (val8_0);
    }

    uint iw = (OW & (~0x7));
    uint ih = oh;

    half8 val8_0 = 0.0f;

    __local half8* in8_0 = (__local half8*)(&in_local[iw + 0 * IW]);
    __local half8* in8_1 = (__local half8*)(&in_local[iw + 1 * IW]);
    __local half8* in8_2 = (__local half8*)(&in_local[iw + 2 * IW]);
    __local half8* in8_3 = (__local half8*)(&in_local[iw + 3 * IW]);
    __local half8* in8_4 = (__local half8*)(&in_local[iw + 4 * IW]);
    __local half8* in8_5 = (__local half8*)(&in_local[iw + 5 * IW]);
    __local half8* in8_6 = (__local half8*)(&in_local[iw + 6 * IW]);
    __local half8* in8_7 = (__local half8*)(&in_local[iw + 7 * IW]);

    for (uint ic = 0; ic < IC / 8; ic ++)
    {
        val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
        val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
        val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
        val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
        val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
        val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
        val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
        val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
    }

    for (uint ic = (IC & (~0x7)); ic < IC; ++ic)
    {
        val8_0 += *((__local half8*)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
    }
    for (uint ow = (OW & (~0x7)); ow < OW; ow ++)
    {
        out_local[ow + 0] = (val8_0[ow % 8]);
    }
}
__kernel void __dma_preload_Convolution1x1_NCHW(
    const __global half* in,
    const __global half* out,
    const __global half* w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC,
    __local half* in_local,
    const __local half* out_local)
{
    const int sizePlane = IW*IH;
    WorkGroupDmaCreateStrideTransaction(
        in + get_group_id(0)*IW, // src
        in_local, // dst
        IW * sizeof(half), // src width
        IW * sizeof(half), // dst width
        sizePlane * sizeof(half), // src stride
        IW * sizeof(half), // dst stride
        IW * IC * sizeof(half), //total size
        0
        );
}
__kernel void __dma_postwrite_Convolution1x1_NCHW(
    const __global half* in,
    __global half* out,
    const __global half* w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC,
    const __local half* in_local,
    const __local half* out_local)
{
    async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0);
}

__kernel void Convolution1x1_NHWC(
    const __global half* in,
    const __global half* out,
    const __global half* w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC,
    const __local half* in_local,
    __local half* out_local)
{
    int oh = get_global_id(0);
    int oc = get_global_id(1);

    int stride;
    int write_output = 0;
    __global half* src;

    __global half8* w8 = (__global half8*)(&w[oc*IC]);
    __global half* w1 = (__global half*)(&w[oc*IC]);

    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8)
    {
        uint iw = ow;
        uint ih = oh;

        half8 val8_0 = 0.0f;
        half8 val8_1 = 0.0f;
        half8 val8_2 = 0.0f;
        half8 val8_3 = 0.0f;
        half8 val8_4 = 0.0f;
        half8 val8_5 = 0.0f;
        half8 val8_6 = 0.0f;
        half8 val8_7 = 0.0f;

        __local half8* in8_0 = (__local half8*)(&in_local[(iw + 0) * IC]);
        __local half8* in8_1 = (__local half8*)(&in_local[(iw + 1) * IC]);
        __local half8* in8_2 = (__local half8*)(&in_local[(iw + 2) * IC]);
        __local half8* in8_3 = (__local half8*)(&in_local[(iw + 3) * IC]);
        __local half8* in8_4 = (__local half8*)(&in_local[(iw + 4) * IC]);
        __local half8* in8_5 = (__local half8*)(&in_local[(iw + 5) * IC]);
        __local half8* in8_6 = (__local half8*)(&in_local[(iw + 6) * IC]);
        __local half8* in8_7 = (__local half8*)(&in_local[(iw + 7) * IC]);

        for (uint ic = 0; ic < IC / 8; ++ic)
        {
            val8_0 += (in8_0[ic]) * (w8[ic]);
            val8_1 += (in8_1[ic]) * (w8[ic]);
            val8_2 += (in8_2[ic]) * (w8[ic]);
            val8_3 += (in8_3[ic]) * (w8[ic]);
            val8_4 += (in8_4[ic]) * (w8[ic]);
            val8_5 += (in8_5[ic]) * (w8[ic]);
            val8_6 += (in8_6[ic]) * (w8[ic]);
            val8_7 += (in8_7[ic]) * (w8[ic]);
        }

        half val_0 = 0.0f;
        half val_1 = 0.0f;
        half val_2 = 0.0f;
        half val_3 = 0.0f;
        half val_4 = 0.0f;
        half val_5 = 0.0f;
        half val_6 = 0.0f;
        half val_7 = 0.0f;
        for (uint ic = IC & (~0x7); ic < IC; ++ic)
        {
            val_0 += *((__local half*)in8_0 + ic) * (*((__global half*)w8 + ic));
            val_1 += *((__local half*)in8_1 + ic) * (*((__global half*)w8 + ic));
            val_2 += *((__local half*)in8_2 + ic) * (*((__global half*)w8 + ic));
            val_3 += *((__local half*)in8_3 + ic) * (*((__global half*)w8 + ic));
            val_4 += *((__local half*)in8_4 + ic) * (*((__global half*)w8 + ic));
            val_5 += *((__local half*)in8_5 + ic) * (*((__global half*)w8 + ic));
            val_6 += *((__local half*)in8_6 + ic) * (*((__global half*)w8 + ic));
            val_7 += *((__local half*)in8_7 + ic) * (*((__global half*)w8 + ic));
        }
        out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
        out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
        out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
        out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
        out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
        out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
        out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
        out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
    }
    for (uint ow = (OW & (~0x7)); ow < OW; ow ++)
    {

        uint iw = ow;
        uint ih = oh;

        half8 val8 = 0.0f;

        __local half8* in8 = (__local half8*)(&in_local[iw * IC]);

        for (uint ic = 0; ic < IC / 8; ++ic)
        {
            val8 += (in8[ic]) * (w8[ic]);
        }

        half val = 0.0f;
        for (uint ic = (IC & (~0x7)); ic < IC; ++ic)
        {
            val += (*((__local half*)in8 + ic)) * (*((__global half*)w8 + ic));
        }
        out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
    }
}
__kernel void __dma_preload_Convolution1x1_NHWC(
    const __global half* in,
    const __global half* out,
    const __global half* w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC,
    __local half* in_local,
    const __local half* out_local)
{
    const int sizeAct = IW*IC;
    async_work_group_copy(in_local, in + get_group_id(0)*sizeAct, sizeAct, 0);
}
__kernel void __dma_postwrite_Convolution1x1_NHWC(
    const __global half* in,
    __global half* out,
    const __global half* w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC,
    const __local half* in_local,
    const __local half* out_local)
{
    async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0);
}
inference-engine/src/vpu/custom_kernels/convolution1x1_chw.cl (new file, 114 lines)
@@ -0,0 +1,114 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void Convolution1x1_NCHW(
    const __global half *in,
    const __global half *out,
    const __global half *w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC)
{
    __local half in_local[8 * 1024];
    __local half out_local[8 * 1024];

    event_t e1 = async_work_group_copy_2D2D(
        in_local, // dst
        in + get_group_id(0) * IW, // src
        IW, // num_elements_per_line,
        IC, // num_lines,
        IW * IH - IW, // src_line_stride,
        0, // dst_line_stride,
        0);
    wait_group_events(1, &e1);

    int oh = get_global_id(0);
    int oc = get_global_id(1);

    int stride;
    int write_output = 0;
    __global half *src;

    __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
    __global half *w1 = (__global half *)(&w[oc * IC]);

    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
        uint iw = ow;
        uint ih = oh;

        half8 val8_0 = 0.0f;

        __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
        __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
        __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
        __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
        __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
        __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
        __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
        __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);

        for (uint ic = 0; ic < IC / 8; ic++) {
            val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
            val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
            val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
            val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
            val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
            val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
            val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
            val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
        }

        for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
            val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
        }
        *((__local half8 *)&out_local[ow + 0]) = (val8_0);
    }

    uint iw = (OW & (~0x7));
    uint ih = oh;

    half8 val8_0 = 0.0f;

    __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
    __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
    __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
    __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
    __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
    __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
    __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
    __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);

    for (uint ic = 0; ic < IC / 8; ic++) {
        val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
        val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
        val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
        val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
        val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
        val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
        val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
        val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
    }

    for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
        val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
    }
    for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
        out_local[ow + 0] = (val8_0[ow % 8]);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(
        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
        out_local,
        OW,
        0);
    wait_group_events(1, &e2);
}
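For reference, what the vectorized loops above compute per work item is a plain 1x1 convolution over channels. A scalar C++ sketch of one (oc, oh) row under the same NCHW layout (illustration only, not the shipped code):

    #include <vector>

    // out[oc][oh][ow] = sum over ic of in[ic][oh][ow] * w[oc][ic]
    void conv1x1_row(const std::vector<float> &in, const std::vector<float> &w,
                     std::vector<float> &out, int IW, int IH, int IC,
                     int OW, int OH, int oc, int oh) {
        for (int ow = 0; ow < OW; ++ow) {
            float acc = 0.f;
            for (int ic = 0; ic < IC; ++ic) {
                acc += in[ic * IW * IH + oh * IW + ow] * w[oc * IC + ic];
            }
            out[oc * OW * OH + oh * OW + ow] = acc;
        }
    }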
inference-engine/src/vpu/custom_kernels/convolution1x1_hwc.cl (new file, 126 lines)
@ -0,0 +1,126 @@
|
||||
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void Convolution1x1_NHWC(
    const __global half *in,
    __global half *out,
    const __global half *w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC)
{
    __local half in_local[8 * 1024];
    __local half out_local[8 * 1024];

    const int sizeAct = IW * IC;

    event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
    wait_group_events(1, &e1);

    int oh = get_global_id(0);
    int oc = get_global_id(1);

    int stride;
    int write_output = 0;
    __global half *src;

    __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
    __global half *w1 = (__global half *)(&w[oc * IC]);

    for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
        uint iw = ow;
        uint ih = oh;

        half8 val8_0 = 0.0f;
        half8 val8_1 = 0.0f;
        half8 val8_2 = 0.0f;
        half8 val8_3 = 0.0f;
        half8 val8_4 = 0.0f;
        half8 val8_5 = 0.0f;
        half8 val8_6 = 0.0f;
        half8 val8_7 = 0.0f;

        __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
        __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
        __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
        __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
        __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
        __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
        __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
        __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);

        for (uint ic = 0; ic < IC / 8; ++ic) {
            val8_0 += (in8_0[ic]) * (w8[ic]);
            val8_1 += (in8_1[ic]) * (w8[ic]);
            val8_2 += (in8_2[ic]) * (w8[ic]);
            val8_3 += (in8_3[ic]) * (w8[ic]);
            val8_4 += (in8_4[ic]) * (w8[ic]);
            val8_5 += (in8_5[ic]) * (w8[ic]);
            val8_6 += (in8_6[ic]) * (w8[ic]);
            val8_7 += (in8_7[ic]) * (w8[ic]);
        }

        half val_0 = 0.0f;
        half val_1 = 0.0f;
        half val_2 = 0.0f;
        half val_3 = 0.0f;
        half val_4 = 0.0f;
        half val_5 = 0.0f;
        half val_6 = 0.0f;
        half val_7 = 0.0f;
        for (uint ic = IC & (~0x7); ic < IC; ++ic) {
            val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic));
            val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic));
            val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic));
            val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic));
            val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic));
            val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic));
            val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic));
            val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic));
        }
        out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
        out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
        out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
        out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
        out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
        out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
        out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
        out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
    }
    for (uint ow = (OW & (~0x7)); ow < OW; ow++) {

        uint iw = ow;
        uint ih = oh;

        half8 val8 = 0.0f;

        __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);

        for (uint ic = 0; ic < IC / 8; ++ic) {
            val8 += (in8[ic]) * (w8[ic]);
        }

        half val = 0.0f;
        for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
            val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic));
        }
        out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(
        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
        out_local,
        OW,
        0);
    wait_group_events(1, &e2);
}
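// Reduction sketch for the NHWC kernel above (assuming, as its uses imply, that
// __builtin_shave_sau_sumx_f16_r horizontally adds the eight half lanes of its
// argument): with channels contiguous per pixel, each output value is one
// contiguous dot product plus a scalar tail for IC not divisible by 8.
//
//     half8 acc8 = 0.0f;
//     for (uint ic = 0; ic < IC / 8; ++ic)      // vector part
//         acc8 += in8[ic] * w8[ic];
//     half acc = __builtin_shave_sau_sumx_f16_r(acc8);
//     for (uint ic = IC & ~0x7u; ic < IC; ++ic) // scalar tail
//         acc += in_local[iw * IC + ic] * w1[ic];
//     out_local[ow] = acc;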
@ -3,64 +3,89 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void Convolution3x3(const __global half* in_param,
    const __global half* out,
    const __global half* w,
    int IW, int IH, int IC,
    int OW, int OH, int OC, int KX, int KY,
    int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y,
    const __local half* in_local,
    __local half* out_local,
    const __local half* w_local)
__kernel void Convolution3x3(
    const __global half *in_param,
    __global half *out,
    const __global half *w,
    int IW,
    int IH,
    int IC,
    int OW,
    int OH,
    int OC,
    int KX,
    int KY,
    int stride_x,
    int stride_y,
    int pad_x,
    int pad_y,
    int dilation_x,
    int dilation_y)
{
    __local half in_local[8 * 1024];
    __local half out_local[8 * 1024];
    __local half w_local[8 * 1024];

    const int sizePlane = IW * IH;
    event_t e1 = async_work_group_copy_2D2D(
        in_local, // dst
        in_param + get_group_id(0) * stride_y * IW, // src
        3 * IW, // num_elements_per_line,
        IC, // num_lines,
        IW * IH - 3 * IW, // src_line_stride,
        0, // dst_line_stride,
        0);
    wait_group_events(1, &e1);
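    // Equivalent scalar form of the 2D copy above (a sketch, reading the
    // arguments as the inline comments name them: elements per line, number of
    // lines, then source/destination line strides in elements):
    //
    //     for (int c = 0; c < IC; c++)          // one 3-row slab per channel
    //         for (int e = 0; e < 3 * IW; e++)
    //             in_local[c * 3 * IW + e] =
    //                 in_param[get_group_id(0) * stride_y * IW + c * IW * IH + e];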

    const int sizeWeight = IC * 3 * 3;
    e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0);
    wait_group_events(1, &e1);

    int oh = get_global_id(0);
    int oc = get_global_id(1);

    __local half* in = (__local half* )in_local + 1;
    __local half *in = (__local half *)in_local + 1;

    int stride;
    int write_output = 0;
    __local half* src;
    __local half *src;

    if((stride_x == 1) && (stride_y == 1))
    {
        stride = OW / 8;
    if ((stride_x == 1) && (stride_y == 1)) {
        stride = OW / 8;
        write_output = 1;
    }
    if((stride_x == 2) && (stride_y == 2))
    {
        stride = OW / 4;
    if ((stride_x == 2) && (stride_y == 2)) {
        stride = OW / 4;
        write_output = 2;
    }
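    // Derived from the stores below: write_output selects the store width per
    // 8-wide compute step. With stride 1 all eight lanes are kept (half8 store);
    // with stride 2 only the even lanes val.s0246 are kept (half4 store), so
    // `stride` here counts vector steps rather than output pixels.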
    for (int ow = 0; ow < stride; ow++)
    {
    for (int ow = 0; ow < stride; ow++) {
        float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
        for (int ic = 0; ic < IC; ++ic)
        {
            src = (__local half* )((__local half8*)(in + ic * IW * 3) + ow);
            __local half* k = (__local half* )(w_local + ic*3*3);
        for (int ic = 0; ic < IC; ++ic) {
            src = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow);
            __local half *k = (__local half *)(w_local + ic * 3 * 3);

            half8 aux_in00 = *((__local half8*)src - 1);
            half8 aux_in01 = *((__local half8*)src + 0);
            half8 aux_in02 = *((__local half8*)src + 1);
            half8 aux_in10 = *((__local half8*)(src + IW) - 1);
            half8 aux_in11 = *((__local half8*)(src + IW) + 0);
            half8 aux_in12 = *((__local half8*)(src + IW) + 1);
            half8 aux_in20 = *((__local half8*)(src + IW * 2) - 1);
            half8 aux_in21 = *((__local half8*)(src + IW * 2) + 0);
            half8 aux_in22 = *((__local half8*)(src + IW * 2) + 1);
            half8 aux_in00 = *((__local half8 *)src - 1);
            half8 aux_in01 = *((__local half8 *)src + 0);
            half8 aux_in02 = *((__local half8 *)src + 1);
            half8 aux_in10 = *((__local half8 *)(src + IW) - 1);
            half8 aux_in11 = *((__local half8 *)(src + IW) + 0);
            half8 aux_in12 = *((__local half8 *)(src + IW) + 1);
            half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1);
            half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0);
            half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1);

            short8 in00 = *((short8*)&aux_in00);
            short8 in01 = *((short8*)&aux_in01);
            short8 in02 = *((short8*)&aux_in02);
            short8 in10 = *((short8*)&aux_in10);
            short8 in11 = *((short8*)&aux_in11);
            short8 in12 = *((short8*)&aux_in12);
            short8 in20 = *((short8*)&aux_in20);
            short8 in21 = *((short8*)&aux_in21);
            short8 in22 = *((short8*)&aux_in22);
            short8 in00 = *((short8 *)&aux_in00);
            short8 in01 = *((short8 *)&aux_in01);
            short8 in02 = *((short8 *)&aux_in02);
            short8 in10 = *((short8 *)&aux_in10);
            short8 in11 = *((short8 *)&aux_in11);
            short8 in12 = *((short8 *)&aux_in12);
            short8 in20 = *((short8 *)&aux_in20);
            short8 in21 = *((short8 *)&aux_in21);
            short8 in22 = *((short8 *)&aux_in22);

            short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14);
            short8 aux_aux01 = in01;
@ -72,15 +97,15 @@ __kernel void Convolution3x3(const __global half* in_param,
            short8 aux_aux21 = in21;
            short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2);

            half8 aux00 = *((half8*)&aux_aux00);
            half8 aux01 = *((half8*)&aux_aux01);
            half8 aux02 = *((half8*)&aux_aux02);
            half8 aux10 = *((half8*)&aux_aux10);
            half8 aux11 = *((half8*)&aux_aux11);
            half8 aux12 = *((half8*)&aux_aux12);
            half8 aux20 = *((half8*)&aux_aux20);
            half8 aux21 = *((half8*)&aux_aux21);
            half8 aux22 = *((half8*)&aux_aux22);
            half8 aux00 = *((half8 *)&aux_aux00);
            half8 aux01 = *((half8 *)&aux_aux01);
            half8 aux02 = *((half8 *)&aux_aux02);
            half8 aux10 = *((half8 *)&aux_aux10);
            half8 aux11 = *((half8 *)&aux_aux11);
            half8 aux12 = *((half8 *)&aux_aux12);
            half8 aux20 = *((half8 *)&aux_aux20);
            half8 aux21 = *((half8 *)&aux_aux21);
            half8 aux22 = *((half8 *)&aux_aux22);

            half8 w00 = (half8)(*(k + 0));
            half8 w01 = (half8)(*(k + 1));
@ -102,69 +127,32 @@ __kernel void Convolution3x3(const __global half* in_param,
            val += convert_float8(aux21) * convert_float8(w21);
            val += convert_float8(aux22) * convert_float8(w22);
        }
        if(write_output == 2)
            *((__local half4*)(out_local) + ow) = convert_half4(val.s0246);
        if(write_output == 1)
            *((__local half8*)(out_local) + ow) = convert_half8(val);
        if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246);
        if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val);
    }

    for (int ow = OW & ~(0x7); ow < OW; ow++)
    {
    for (int ow = OW & ~(0x7); ow < OW; ow++) {
        float val = 0.0f;
        for (int ic = 0; ic < IC; ++ic)
        {
            for (int ky = 0; ky < 3; ++ky)
            {
                for (int kx = 0; kx < 3; ++kx)
                {
        for (int ic = 0; ic < IC; ++ic) {
            for (int ky = 0; ky < 3; ++ky) {
                for (int kx = 0; kx < 3; ++kx) {
                    int iw = ow * stride_x - pad_x + kx * dilation_x;
                    int ih = oh * stride_y - pad_y + ky * dilation_y;

                    val += convert_float(in[ic*IW*3 + (ky * dilation_y)*IW + iw]) * convert_float(w_local[ic*3*3 + ky*3 + kx]);
                    val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw])
                        * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]);
                }
            }
        }
        out_local[ow] = convert_half(val);
    }
}

__kernel void __dma_preload_Convolution3x3(
    const __global half* in_param,
    const __global half* out,
    const __global half* w,
    int IW, int IH, int IC,
    int OW, int OH, int OC, int KX, int KY,
    int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y,
    __local half* in_local,
    const __local half* out_local,
    __local half* w_local)
{
    const int sizePlane = IW*IH;
    WorkGroupDmaCreateStrideTransaction(
        in_param + get_group_id(0)*stride_y*IW, // src
        in_local, // dst
        3 * IW * sizeof(half), // src width
        3 * IW * sizeof(half), // dst width
        sizePlane * sizeof(half), // src stride
        3 * IW * sizeof(half), // dst stride
        3 * IW * IC * sizeof(half), //total size
        0
    );
    barrier(CLK_LOCAL_MEM_FENCE);

    const int sizeWeight = IC*3*3;
    async_work_group_copy(w_local, w + get_group_id(1)*sizeWeight, sizeWeight, 0);
}

__kernel void __dma_postwrite_Convolution3x3(
    const __global half* in_param,
    __global half* out,
    const __global half* w,
    int IW, int IH, int IC,
    int OW, int OH, int OC, int KX, int KY,
    int stride_x, int stride_y, int pad_x, int pad_y, int dilation_x, int dilation_y,
    const __local half* in_local,
    const __local half* out_local,
    const __local half* w_local)
{
    async_work_group_copy(out + get_group_id(1)*OW*OH + get_group_id(0)*OW, out_local, OW, 0);
    event_t e2 = async_work_group_copy(
        out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
        out_local,
        OW,
        0);
    wait_group_events(1, &e2);
}
@ -4,112 +4,105 @@

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define MAX_OPENCL_BUFF_SIZE 64*1024
#define MAX_OPENCL_BUFF_SIZE 64 * 1024

// Define if the runtime supports it. The MX runtime is compatible; KMB support is still in progress.
#define USE_MANUAL_DMA 1
#define USE_DMA 1

#if defined (USE_MANUAL_DMA)
void dmacpyLineSrcStrideStart(global half* from, private half* to, int size, int src_width, int src_stride)
#if defined(USE_DMA)
void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride)
{
    item_dma_event_t copyEvent = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
    item_dma_event_t copyEvent =
        WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
    WaitWorkItemDmaEvents(1, &copyEvent);
}

void dmacpyLineDstStrideStart(private half* from, global half* to, int size, int src_width, int src_stride)
void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride)
{
    item_dma_event_t copyEvent = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
    item_dma_event_t copyEvent =
        WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
    WaitWorkItemDmaEvents(1, &copyEvent);
}
#endif

void memzero(void * ptr, size_t num)
void memzero(void *ptr, size_t num)
{
    float4* line0_ = (float4*) ptr;
    float4 *line0_ = (float4 *)ptr;
#pragma unroll 16
    for (int i = 0; i < num/16; i++)
    {
    for (int i = 0; i < num / 16; i++) {
        line0_[i] = (float4){0.f, 0.f, 0.f, 0.f};
    }
    uchar* ptr_ = (uchar*) ptr;
    for (int i = num/16*16; i < num; i++)
    {
    uchar *ptr_ = (uchar *)ptr;
    for (int i = num / 16 * 16; i < num; i++) {
        ptr_[i] = 0;
    }
}
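// Note on memzero, derived from its call sites: `num` is a byte count, so the
// float4 loop clears 16 bytes per iteration and the uchar loop finishes the
// tail, e.g.
//
//     memzero(line0, max_channels * bottomwidth * sizeof(half));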
void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
    __private const half* restrict line1,
    __private half* restrict dline,
    int topwidth,
    int max_displacement,
    int neighborhood_grid_radius,
    int kernel_size,
    int padding,
    int bottomwidth,
    int stride1,
    int stride2,
    int max_channels,
    int cur_subchannels)
void __attribute__((noinline)) crosscorrh(
    __private const half *restrict line0,
    __private const half *restrict line1,
    __private half *restrict dline,
    int topwidth,
    int max_displacement,
    int neighborhood_grid_radius,
    int kernel_size,
    int padding,
    int bottomwidth,
    int stride1,
    int stride2,
    int max_channels,
    int cur_subchannels)
{
    if (max_channels == 64)
    {
        for (int i = 0; i < kernel_size; i++)
        {
            int x1 = max_displacement - padding + i;
            int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1)/stride1;
            x1 += offset1*stride1;
    if (max_channels == 64) {
        for (int i = 0; i < kernel_size; i++) {
            int x1 = max_displacement - padding + i;
            int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1;
            x1 += offset1 * stride1;

            for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1)
            {
                int x2 = x1 - neighborhood_grid_radius*stride2;
                int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1)/stride2;
                x2 += offset2*stride2;
            for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) {
                int x2 = x1 - neighborhood_grid_radius * stride2;
                int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1) / stride2;
                x2 += offset2 * stride2;

                for (int top_channel_x = offset2 - neighborhood_grid_radius;
                     top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth;
                     top_channel_x++, x2 += stride2)
                {
                     top_channel_x++, x2 += stride2) {
                    half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

                    half8* src0 = (half8*)(line0 + x1*max_channels);
                    half8* src1 = (half8*)(line1 + x2*max_channels);
                    half8 *src0 = (half8 *)(line0 + x1 * max_channels);
                    half8 *src1 = (half8 *)(line1 + x2 * max_channels);

#pragma unroll 8
                    for (int ch = 0; ch < max_channels/8; ch++)
                        sum4 += (src0[ch])*(src1[ch]);
                    for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]);

                    half sum = __builtin_shave_sau_sumx_f16_r(sum4);
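                    // __builtin_shave_sau_sumx_f16_r acts as the horizontal
                    // reduction used throughout these kernels: it collapses the
                    // eight half lanes into one scalar, i.e. (a sketch)
                    //
                    //     half sum = sum4.s0 + sum4.s1 + ... + sum4.s7;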
                    dline[(top_channel_x + neighborhood_grid_radius)*topwidth + blockIdx_x] += (sum);
                    dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum);
                }
            }
        }
    }
    else
    {
        int neighborhood_grid_width = 2*neighborhood_grid_radius + 1;
    } else {
        int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1;

        for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++)
        {
            for (int i = 0; i < kernel_size; i++)
            {
                int x1 = blockIdx_x*stride1 + max_displacement + i - padding;
        for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) {
            for (int i = 0; i < kernel_size; i++) {
                int x1 = blockIdx_x * stride1 + max_displacement + i - padding;

                if ((x1 >= 0) && (x1 < bottomwidth))
                {
                    int o_min = - neighborhood_grid_radius*stride2;
                    int o_max = neighborhood_grid_width*stride2 - neighborhood_grid_radius*stride2;
                    if ((o_min) < ( - x1)) o_min -= ((x1 + o_min - (stride2 - 1))/stride2)*stride2;
                    if ((o_max) >= (bottomwidth+stride2 - x1)) o_max -= ((x1 + o_max - bottomwidth )/stride2)*stride2;
                if ((x1 >= 0) && (x1 < bottomwidth)) {
                    int o_min = -neighborhood_grid_radius * stride2;
                    int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2;
                    if ((o_min) < (-x1)) {
                        o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2;
                    }
                    if ((o_max) >= (bottomwidth + stride2 - x1)) {
                        o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2;
                    }

                    int o = o_min;
                    for (; o <= o_max - 4*stride2; o += 4*stride2)
                    {
                        half8* bottom0 = (half8*)(line0 + x1*max_channels);
                        half8* bottom1_0 = (half8*)(line1 + (x1 + o + 0*stride2)*max_channels);
                        half8* bottom1_1 = (half8*)(line1 + (x1 + o + 1*stride2)*max_channels);
                        half8* bottom1_2 = (half8*)(line1 + (x1 + o + 2*stride2)*max_channels);
                        half8* bottom1_3 = (half8*)(line1 + (x1 + o + 3*stride2)*max_channels);
                    for (; o <= o_max - 4 * stride2; o += 4 * stride2) {
                        half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
                        half8 *bottom1_0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels);
                        half8 *bottom1_1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels);
                        half8 *bottom1_2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels);
                        half8 *bottom1_3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels);

                        int c = 0;

@ -118,8 +111,7 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
                        half8 sum42 = 0;
                        half8 sum43 = 0;

                        for (; c <= cur_subchannels/8 - 4; c += 4)
                        {
                        for (; c <= cur_subchannels / 8 - 4; c += 4) {
                            sum40 += bottom0[c + 0] * bottom1_0[c + 0];
                            sum40 += bottom0[c + 1] * bottom1_0[c + 1];
                            sum40 += bottom0[c + 2] * bottom1_0[c + 2];
@ -141,8 +133,7 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
                            sum43 += bottom0[c + 3] * bottom1_3[c + 3];
                        }

                        for (; c < cur_subchannels/8; c++)
                        {
                        for (; c < cur_subchannels / 8; c++) {
                            sum40 += bottom0[c] * bottom1_0[c];
                            sum41 += bottom0[c] * bottom1_1[c];
                            sum42 += bottom0[c] * bottom1_2[c];
@ -154,48 +145,47 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
                        half sum2 = __builtin_shave_sau_sumx_f16_r(sum42);
                        half sum3 = __builtin_shave_sau_sumx_f16_r(sum43);

                        for (c = c*8; c < cur_subchannels; c++)
                        {
                            sum0 += line0[x1*max_channels + c] * line1[(x1 + o + 0*stride2)*max_channels + c];
                            sum1 += line0[x1*max_channels + c] * line1[(x1 + o + 1*stride2)*max_channels + c];
                            sum2 += line0[x1*max_channels + c] * line1[(x1 + o + 2*stride2)*max_channels + c];
                            sum3 += line0[x1*max_channels + c] * line1[(x1 + o + 3*stride2)*max_channels + c];
                        for (c = c * 8; c < cur_subchannels; c++) {
                            sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c];
                            sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c];
                            sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c];
                            sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c];
                        }

                        dline[blockIdx_x + (((o/stride2) + 0)*topwidth + neighborhood_grid_radius*topwidth)] += sum0;
                        dline[blockIdx_x + (((o/stride2) + 1)*topwidth + neighborhood_grid_radius*topwidth)] += sum1;
                        dline[blockIdx_x + (((o/stride2) + 2)*topwidth + neighborhood_grid_radius*topwidth)] += sum2;
                        dline[blockIdx_x + (((o/stride2) + 3)*topwidth + neighborhood_grid_radius*topwidth)] += sum3;
                        dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] +=
                            sum0;
                        dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] +=
                            sum1;
                        dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] +=
                            sum2;
                        dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] +=
                            sum3;
                    }

                    for (; o < o_max; o += 1*stride2)
                    {
                        half8* bottom0 = (half8*)(line0 + x1*max_channels);
                        half8* bottom1 = (half8*)(line1 + (x1 + o)*max_channels);
                    for (; o < o_max; o += 1 * stride2) {
                        half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
                        half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels);

                        int c = 0;

                        half8 sum4 = 0;
                        for (; c <= cur_subchannels/8 - 4; c += 4)
                        {
                        for (; c <= cur_subchannels / 8 - 4; c += 4) {
                            sum4 += bottom0[c + 0] * bottom1[c + 0];
                            sum4 += bottom0[c + 1] * bottom1[c + 1];
                            sum4 += bottom0[c + 2] * bottom1[c + 2];
                            sum4 += bottom0[c + 3] * bottom1[c + 3];
                        }
                        for (; c < cur_subchannels/8; c++)
                        {
                        for (; c < cur_subchannels / 8; c++) {
                            sum4 += bottom0[c] * bottom1[c];
                        }

                        half sum = __builtin_shave_sau_sumx_f16_r(sum4);

                        for (c = c*8; c < cur_subchannels; c++)
                        {
                            sum += line0[x1*max_channels + c] * line1[(x1 + o)*max_channels + c];
                        for (c = c * 8; c < cur_subchannels; c++) {
                            sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c];
                        }

                        dline[blockIdx_x + (((o + neighborhood_grid_radius*stride2)/stride2)*topwidth)] += sum;
                        dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum;
                    }
                }
            }
@ -203,243 +193,257 @@ void __attribute__((noinline)) crosscorrh(__private const half* restrict line0,
    }
}

__kernel void correlate2_half(__global const half* restrict bottom0,
    __global const half* restrict bottom1,
    __global half* restrict top,
    int topwidth,
    int topheight,
    int bottomwidth,
    int bottomheight,
    int bottomchannels,
    int max_displacement,
    int padding,
    int neighborhood_grid_radius,
    int neighborhood_grid_width,
    int kernel_size,
    int stride1,
    int stride2)
__kernel void correlate2_half(
    __global const half *restrict bottom0,
    __global const half *restrict bottom1,
    __global half *restrict top,
    int topwidth,
    int topheight,
    int bottomwidth,
    int bottomheight,
    int bottomchannels,
    int max_displacement,
    int padding,
    int neighborhood_grid_radius,
    int neighborhood_grid_width,
    int kernel_size,
    int stride1,
    int stride2)
{
    int max_channels = (MAX_OPENCL_BUFF_SIZE/sizeof(half) - topwidth*neighborhood_grid_width) / (3*bottomwidth);
    int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth);
    if (max_channels > 64) max_channels = 64;
    int subchannels_count = (bottomchannels + max_channels - 1) / max_channels;
    int subchannels = (bottomchannels + subchannels_count-1) / subchannels_count;
    int subchannels = (bottomchannels + subchannels_count - 1) / subchannels_count;
    if (subchannels < max_channels) subchannels = max_channels;

    const int sumelems = kernel_size*kernel_size*bottomchannels;
    const int sumelems = kernel_size * kernel_size * bottomchannels;

    __private half cmx[MAX_OPENCL_BUFF_SIZE/sizeof(half)];
    __private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)];

    __private half* line0 = cmx;
    __private half* line1 = line0 + bottomwidth*subchannels;
    __private half* dline = line1 + bottomwidth*subchannels;
    __private half *line0 = cmx;
    __private half *line1 = line0 + bottomwidth * subchannels;
    __private half *dline = line1 + bottomwidth * subchannels;
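    // CMX budget sketch, derived from the max_channels formula above: the
    // private buffer holds three bottomwidth * subchannels rows (line0, line1,
    // and the dmabuf staging row used on the DMA path) plus the
    // topwidth * neighborhood_grid_width accumulators in dline, so max_channels
    // is the largest ch satisfying
    //
    //     3 * bottomwidth * ch + topwidth * neighborhood_grid_width
    //         <= MAX_OPENCL_BUFF_SIZE / sizeof(half)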

    int blockIdx_y = get_global_id(0);

#if defined(USE_MANUAL_DMA)
    __private half* dmabuf = dline + topwidth*neighborhood_grid_width;
#if defined(USE_DMA)
    __private half *dmabuf = dline + topwidth * neighborhood_grid_width;
#endif

    int y1 = blockIdx_y*stride1 + max_displacement;
    int y1 = blockIdx_y * stride1 + max_displacement;

    for (int j = 0; j < kernel_size; j++)
    {
        for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels)
        {
    for (int j = 0; j < kernel_size; j++) {
        for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) {
            // configure channel batching
            int startchannel = bottomchannel;
            int endchannel = startchannel + subchannels > bottomchannels ? bottomchannels : startchannel + subchannels;
            int deltachannels = endchannel-startchannel;
            int deltachannels = endchannel - startchannel;

            // load line from blob 0 with repackaging
            if (y1+j-padding >= 0 && y1+j-padding < bottomheight)
            {
#if defined(USE_MANUAL_DMA)
                __global const half* curr = bottom0 + startchannel*bottomheight*bottomwidth + (y1+j-padding)*bottomwidth;
                dmacpyLineSrcStrideStart(curr,
                    dmabuf,
                    bottomwidth*deltachannels*sizeof(half),
                    bottomwidth*sizeof(half),
                    bottomwidth*bottomheight*sizeof(half));
            if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) {
#if defined(USE_DMA)
                __global const half *curr =
                    bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth;
                dmacpyLineSrcStrideStart(
                    curr,
                    dmabuf,
                    bottomwidth * deltachannels * sizeof(half),
                    bottomwidth * sizeof(half),
                    bottomwidth * bottomheight * sizeof(half));

                for (int ch = 0; ch < deltachannels; ch++)
                {
                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++)
                    {
                        half8 val = ((half8*)(dmabuf + ch*bottomwidth))[blockIdx_x];
                        line0[(blockIdx_x*8 + 0)*max_channels+ch] = val[0];
                        line0[(blockIdx_x*8 + 1)*max_channels+ch] = val[1];
                        line0[(blockIdx_x*8 + 2)*max_channels+ch] = val[2];
                        line0[(blockIdx_x*8 + 3)*max_channels+ch] = val[3];
                for (int ch = 0; ch < deltachannels; ch++) {
                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
                        half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
                        line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
                        line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
                        line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
                        line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

                        line0[(blockIdx_x*8 + 4)*max_channels+ch] = val[4];
                        line0[(blockIdx_x*8 + 5)*max_channels+ch] = val[5];
                        line0[(blockIdx_x*8 + 6)*max_channels+ch] = val[6];
                        line0[(blockIdx_x*8 + 7)*max_channels+ch] = val[7];
                        line0[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
                        line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
                        line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
                        line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
                    }

                    for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++)
                    {
                        line0[(blockIdx_x)*max_channels+ch] = dmabuf[blockIdx_x + ch*bottomwidth];
                    for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
                        line0[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
                    }
                }

                if (deltachannels < subchannels)
                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
                        memzero(line0 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half));
                        memzero(
                            line0 + blockIdx_x * max_channels + deltachannels,
                            (subchannels - deltachannels) * sizeof(half));
#else
                for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
                {
                for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
                    for (int ch = 0; ch < deltachannels; ch++)
                        line0[blockIdx_x*max_channels+ch]
                            = bottom0[(ch+startchannel)*bottomheight*bottomwidth + (y1+j-padding)*bottomwidth + blockIdx_x];
                        line0[blockIdx_x * max_channels + ch] = bottom0
                            [(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth
                             + blockIdx_x];

                    if (deltachannels < subchannels)
                        memzero(line0 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half));
                        memzero(
                            line0 + blockIdx_x * max_channels + deltachannels,
                            (subchannels - deltachannels) * sizeof(half));
                }
#endif
            }
            else
                memzero(line0, max_channels*bottomwidth*sizeof(half));
            } else
                memzero(line0, max_channels * bottomwidth * sizeof(half));

            for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++)
            {
            for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) {
                int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2;

                // load line from blob 1 with repackaging according to the line we work on now
                if (y2+j-padding >= 0 && y2+j-padding < bottomheight)
                {
#if defined(USE_MANUAL_DMA)
                    __global const half* curr = bottom1 + startchannel*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth;
                    dmacpyLineSrcStrideStart(curr,
                        dmabuf,
                        bottomwidth*deltachannels*sizeof(half),
                        bottomwidth*sizeof(half),
                        bottomwidth*bottomheight*sizeof(half));
                if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) {
#if defined(USE_DMA)
                    __global const half *curr =
                        bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth;
                    dmacpyLineSrcStrideStart(
                        curr,
                        dmabuf,
                        bottomwidth * deltachannels * sizeof(half),
                        bottomwidth * sizeof(half),
                        bottomwidth * bottomheight * sizeof(half));

                    for (int ch = 0; ch < deltachannels; ch++)
                    {
                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++)
                        {
                            half8 val = ((half8*)(dmabuf + ch*bottomwidth))[blockIdx_x];
                            line1[(blockIdx_x*8 + 0)*max_channels+ch] = val[0];
                            line1[(blockIdx_x*8 + 1)*max_channels+ch] = val[1];
                            line1[(blockIdx_x*8 + 2)*max_channels+ch] = val[2];
                            line1[(blockIdx_x*8 + 3)*max_channels+ch] = val[3];
                    for (int ch = 0; ch < deltachannels; ch++) {
                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
                            half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
                            line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
                            line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
                            line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
                            line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

                            line1[(blockIdx_x*8 + 4)*max_channels+ch] = val[4];
                            line1[(blockIdx_x*8 + 5)*max_channels+ch] = val[5];
                            line1[(blockIdx_x*8 + 6)*max_channels+ch] = val[6];
                            line1[(blockIdx_x*8 + 7)*max_channels+ch] = val[7];
                            line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
                            line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
                            line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
                            line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
                        }

                        for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++)
                        {
                            line1[(blockIdx_x)*max_channels+ch] = dmabuf[blockIdx_x + ch*bottomwidth];
                        for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
                            line1[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
                        }
                    }
#else
                    for (int ch = 0; ch < deltachannels; ch++)
                    {
                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth/8; blockIdx_x++)
                        {
                            half8 val = ((__global half8*)(bottom1 + (ch+startchannel)*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth))[blockIdx_x];
                            line1[(blockIdx_x*8 + 0)*max_channels+ch] = val[0];
                            line1[(blockIdx_x*8 + 1)*max_channels+ch] = val[1];
                            line1[(blockIdx_x*8 + 2)*max_channels+ch] = val[2];
                            line1[(blockIdx_x*8 + 3)*max_channels+ch] = val[3];
                    for (int ch = 0; ch < deltachannels; ch++) {
                        for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
                            half8 val = ((
                                __global half8
                                    *)(bottom1 + (ch + startchannel) * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth))
                                [blockIdx_x];
                            line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
                            line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
                            line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
                            line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];

                            line1[(blockIdx_x*8 + 4)*max_channels+ch] = val[4];
                            line1[(blockIdx_x*8 + 5)*max_channels+ch] = val[5];
                            line1[(blockIdx_x*8 + 6)*max_channels+ch] = val[6];
                            line1[(blockIdx_x*8 + 7)*max_channels+ch] = val[7];
                            line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
                            line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
                            line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
                            line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
                        }
                        for (int blockIdx_x = bottomwidth/8*8; blockIdx_x < bottomwidth; blockIdx_x++)
                        {
                            half val = (bottom1 + (ch+startchannel)*bottomheight*bottomwidth + (y2+j-padding)*bottomwidth)[blockIdx_x];
                            line1[(blockIdx_x)*max_channels+ch] = val;
                        for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
                            half val =
                                (bottom1 + (ch + startchannel) * bottomheight * bottomwidth
                                 + (y2 + j - padding) * bottomwidth)[blockIdx_x];
                            line1[(blockIdx_x)*max_channels + ch] = val;
                        }
                    }
#endif
                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
                    {
                    for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
                        if (deltachannels < subchannels)
                            memzero(line1 + blockIdx_x*max_channels+deltachannels, (subchannels-deltachannels)*sizeof(half));
                            memzero(
                                line1 + blockIdx_x * max_channels + deltachannels,
                                (subchannels - deltachannels) * sizeof(half));
                    }
                }
                else
                    memzero(line1, max_channels*bottomwidth*sizeof(half));
                } else
                    memzero(line1, max_channels * bottomwidth * sizeof(half));

                if(j == 0 && startchannel == 0)
                {
                    memzero(dline, neighborhood_grid_width*topwidth*sizeof(half));
                }
                else
                {
#if defined(USE_MANUAL_DMA)
                    dmacpyLineSrcStrideStart(top + top_channel_y*neighborhood_grid_width*topheight*topwidth + blockIdx_y*topwidth,
                        dline,
                        topwidth*neighborhood_grid_width*sizeof(half),
                        topwidth*sizeof(half),
                        topwidth*topheight*sizeof(half));
                if (j == 0 && startchannel == 0) {
                    memzero(dline, neighborhood_grid_width * topwidth * sizeof(half));
                } else {
#if defined(USE_DMA)
                    dmacpyLineSrcStrideStart(
                        top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
                        dline,
                        topwidth * neighborhood_grid_width * sizeof(half),
                        topwidth * sizeof(half),
                        topwidth * topheight * sizeof(half));
#else
                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++)
                    {
                        for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++)
                        {
                            half8 val = ((__global half8*)(top + ((top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth)))[blockIdx_x];
                            ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] = val;
                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
                        for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
                            half8 val = ((
                                __global half8
                                    *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
                                [blockIdx_x];
                            ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = val;
                        }
                        for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++)
                        {
                            dline[top_channel_x*topwidth+blockIdx_x] =
                                top[(top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth+blockIdx_x];
                        for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
                            dline[top_channel_x * topwidth + blockIdx_x] =
                                top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
                                    + blockIdx_y * topwidth + blockIdx_x];
                        }
                    }
#endif
                }

                if (y1+j-padding >= 0 && y1+j-padding < bottomheight && y2+j-padding >= 0 && y2+j-padding < bottomheight)
                {
                    crosscorrh(line0, line1, dline, topwidth, max_displacement, neighborhood_grid_radius,
                        kernel_size, padding, bottomwidth, stride1, stride2, max_channels, subchannels);
                if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0
                    && y2 + j - padding < bottomheight) {
                    crosscorrh(
                        line0,
                        line1,
                        dline,
                        topwidth,
                        max_displacement,
                        neighborhood_grid_radius,
                        kernel_size,
                        padding,
                        bottomwidth,
                        stride1,
                        stride2,
                        max_channels,
                        subchannels);
                }

                if (j == kernel_size-1 && endchannel == bottomchannels)
                {
                    half8 scale = (half8){(half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems, (half)sumelems};
                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++)
                    {
                        for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++)
                        {
                            ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] =
                                ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] / scale;
                if (j == kernel_size - 1 && endchannel == bottomchannels) {
                    half8 scale = (half8){
                        (half)sumelems,
                        (half)sumelems,
                        (half)sumelems,
                        (half)sumelems,
                        (half)sumelems,
                        (half)sumelems,
                        (half)sumelems,
                        (half)sumelems};
                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
                        for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
                            ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] =
                                ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] / scale;
                        }
                        for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++)
                        {
                            dline[top_channel_x*topwidth+blockIdx_x] = dline[top_channel_x*topwidth+blockIdx_x]/(half)sumelems;
                        for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
                            dline[top_channel_x * topwidth + blockIdx_x] =
                                dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems;
                        }
                    }
                }

#if defined(USE_MANUAL_DMA)
                dmacpyLineDstStrideStart(dline,
                    top + top_channel_y*neighborhood_grid_width*topheight*topwidth + blockIdx_y*topwidth,
                    topwidth*neighborhood_grid_width*sizeof(half),
                    topwidth*sizeof(half),
                    topwidth*topheight*sizeof(half));
#if defined(USE_DMA)
                dmacpyLineDstStrideStart(
                    dline,
                    top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
                    topwidth * neighborhood_grid_width * sizeof(half),
                    topwidth * sizeof(half),
                    topwidth * topheight * sizeof(half));
#else
                for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++)
                {
                    for (int blockIdx_x = 0; blockIdx_x < topwidth/8; blockIdx_x++)
                    {
                        ((__global half8*)(top + ((top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth)))[blockIdx_x] =
                            ((half8*)(dline + top_channel_x*topwidth))[blockIdx_x] + (half8) {0, 0, 0, 0, 0, 0, 0, 0};
                for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
                    for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
                        ((__global half8
                              *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
                            [blockIdx_x] = ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x]
                                           + (half8){0, 0, 0, 0, 0, 0, 0, 0};
                    }
                    for (int blockIdx_x = (topwidth/8)*8; blockIdx_x < topwidth; blockIdx_x++)
                    {
                        top[(top_channel_y*neighborhood_grid_width+top_channel_x)*topheight*topwidth + blockIdx_y*topwidth+blockIdx_x]
                            = dline[top_channel_x*topwidth+blockIdx_x] + (half)0;
                    for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
                        top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
                            + blockIdx_y * topwidth + blockIdx_x] =
                            dline[top_channel_x * topwidth + blockIdx_x] + (half)0;
                    }
                }
#endif
@ -3,10 +3,12 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__global half *find(__global const half *begin, __global const half *end, half value) {
__global half *find(__global const half *begin, __global const half *end, half value)
{
    while (begin != end) {
        if (*begin == value) {
        if (*begin == value) {
            return begin;
        }
        ++begin;
@ -14,160 +16,79 @@ __global half *find(__global const half *begin, __global const half *end, half v
    return end;
}

#define USE_MANUAL_DMA

#ifdef USE_MANUAL_DMA

__kernel void __dma_preload_CTCDecoder(__global half *probabilities,
    __global half *sequence_indicators,
    __global half *output_sequences,
    int width,
    int height,
    int channels,
    __local half *local_src,
    __local half *local_dst)
__kernel void CTCDecoder(
    __global half *restrict probabilities,
    __global half *restrict sequence_indicators,
    __global half *restrict output,
    int width,
    int height,
    int channels)
{
    WorkGroupDmaCreateStrideTransaction(
        probabilities, // src
    __local half local_src[88 * 1 * 77];
    __local half local_dst[88 * 1];

    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        width * sizeof(half), // src_width,
        width * sizeof(half), // dst_width,
        width * height * sizeof(half), // src_stride,
        width * sizeof(half), // dst_stride,
        width * height * channels * sizeof(half), // size
        probabilities, // src
        width, // num_elements_per_line,
        height * channels, // num_lines,
        width * (height - 1), // src_line_stride,
        width * (height - 1), // dst_line_stride,
        0);
}

__kernel void __dma_postwrite_CTCDecoder(__global half *probabilities,
    __global half *sequence_indicators,
    __global half *output_sequences,
    int width,
    int height,
    int channels,
    __local half *local_src,
    __local half *local_dst)
{
    WorkGroupDmaCreateStrideTransaction(
        local_dst, // src
        output_sequences, // dst
        channels * sizeof(half), // src_width,
        channels * sizeof(half), // dst_width,
        channels * sizeof(half), // src_stride,
        channels * sizeof(half), // dst_stride,
        channels * height * sizeof(half), // size
        0);
}
    wait_group_events(1, &e1);

__kernel void CTCDecoder(__global half *probabilities,
    __global half *sequence_indicators,
    __global half *output_sequences,
    int width,
    int height,
    int channels,
    __local half *local_src,
    __local half *local_dst)
{
    const int T = channels;
    const int B = height;
    const int C = width;
    const int T = channels; // Time
    const int B = height; // Batches
    const int C = width; // Chars

    for (int i = 0; i < B*T; i++)
    {
#pragma unroll 4
    for (int i = 0; i < B * T; i++) {
        local_dst[i] = -1.h;
    }

    int output_index = 0;

    for (int b = 0; b < B; ++b)
    {
        __global const half *seq_ind = sequence_indicators + b*T;
    for (int b = 0; b < B; ++b) {
        __global const half *restrict seq_ind = sequence_indicators + b * T;
        const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
        const int time = min(seq_len, T);
        const int time = min(seq_len, T);

        int prev_class_idx = -1;

        for (int t = 0; t < time; ++t)
        {
            __local const half *probs = local_src + b*C + t*C*B;
            int max_class_idx = 0;
            half max_prob = probs[0];
#pragma unroll 4
        for (int t = 0; t < time; ++t) {
            __local const half *restrict probs = local_src + b * C + t * C * B;

            for (int c = 1; c < C; ++c)
            {
            int max_class_idx = 0;
            half max_prob = probs[0];
            for (int c = 1; c < C; ++c) {
                const half prob = probs[c];
                if (prob > max_prob)
                {
                if (prob > max_prob) {
                    max_class_idx = c;
                    max_prob = prob;
                    max_prob = prob;
                }
            }

            if (max_class_idx < C-1 && max_class_idx != prev_class_idx)
            {
                local_dst[b*T + output_index] = (half)max_class_idx;
            if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) {
                local_dst[b * T + output_index] = (half)max_class_idx;
                output_index++;
            }

            prev_class_idx = max_class_idx;
        }
    }
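    // Greedy CTC decoding in a nutshell (a sketch of what the loops above do):
    // per timestep take the argmax class, then emit it only if it is neither
    // the blank label (C - 1) nor a repeat of the previous argmax, e.g.
    //
    //     argmax per t: 2 2 blank 2 3 3  ->  emitted: 2 2 3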

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        output, // dst
        local_dst, // src
        channels, // num_elements_per_line,
        height, // num_lines,
        0, // src_line_stride,
        0, // dst_line_stride,
        0);

    wait_group_events(1, &e2);
}

#else

__kernel void CTCDecoder(__global half *probabilities,
    __global half *sequence_indicators,
    __global half *output_sequences,
    int width,
    int height,
    int channels,
    __local half *local_src,
    __local half *local_dst)
{
    const int T = channels;
    const int B = height;
    const int C = width;

    for (int i = 0; i < B*T; i++)
    {
        output_sequences[i] = -1.h;
    }

    int output_index = 0;

    for (int b = 0; b < B; ++b)
    {
        __global const half *seq_ind = sequence_indicators + b*T;
        const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
        const int time = min(seq_len, T);

        int prev_class_idx = -1;

        for (int t = 0; t < time; ++t)
        {
            __global const half *probs = probabilities + b*C + t*C*B;
            int max_class_idx = 0;
            half max_prob = probs[0];

            for (int c = 1; c < C; ++c)
            {
                const half prob = probs[c];
                if (prob > max_prob)
                {
                    max_class_idx = c;
                    max_prob = prob;
                }
            }

            if (max_class_idx < C-1 && max_class_idx != prev_class_idx)
            {
                output_sequences[b*T + output_index] = (half)max_class_idx;
                output_index++;
            }

            prev_class_idx = max_class_idx;
        }
    }
}

#endif
@ -1,6 +1,6 @@
<CustomLayer name="ReorgYolo" type="MVCL" version="1">
    <Kernel entry="reorg_hwc_naive">
        <Source filename="reorg_hwc.bin"/>
        <Source filename="reorg_hwc_naive.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
@ -8,15 +8,12 @@
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Scalar arg-name="stride" type="int" source="stride"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="0"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="0"/>
        </Parameters>
        <WorkSizes dim="input,0" global="F,1,1" local="stride*stride,1,1"/>
    </Kernel>
</CustomLayer>

<CustomLayer name="ReorgYolo" type="MVCL" version="1">
    <Where stride="2"/>
    <Kernel entry="reorg_chw">
        <Source filename="reorg_chw.bin"/>
        <Parameters>
@ -26,22 +23,18 @@
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Scalar arg-name="stride" type="int" source="stride"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*2*2"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*2*2"/>
        </Parameters>
        <WorkSizes dim="input,0" global="Y*F/(stride*stride),stride*stride,1" local="1,stride,1"/>
        <WorkSizes dim="input,0" global="Y*F/(stride*stride),stride*stride,1" local="stride,stride,1"/>
    </Kernel>
</CustomLayer>

<CustomLayer name="RegionYolo" type="MVCL" version="1">
    <Where do_softmax="1"/>
    <Kernel entry="region_chw">
        <Source filename="region.bin"/>
        <Source filename="region_chw.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
@ -50,82 +43,74 @@
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
        <WorkSizes global="((X+7)/8)*8*Y,num,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
        <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
</CustomLayer>
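<!-- Work-size note, reading the expressions above: ((X*Y+7)/8)*8 rounds the
     pixel count up to the next multiple of 8 so the half8-vectorized region
     kernels are always launched on whole 8-element groups. -->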
<CustomLayer name="RegionYolo" type="MVCL" version="1">
    <Where do_softmax="0" mask="0,1,2"/>
    <Kernel entry="region_chw">
        <Source filename="region.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
            <Scalar arg-name="coords" type="int" source="coords"/>
            <Scalar arg-name="num" type="int" source="num"/>
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
        <WorkSizes global="((X+7)/8)*8*Y,3,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
<CustomLayer name="RegionYolo" type="MVCL" version="1">
    <Where do_softmax="0" mask="0,1,2"/>
    <Kernel entry="region_chw">
        <Source filename="region_chw.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
            <Scalar arg-name="coords" type="int" source="coords"/>
            <Scalar arg-name="num" type="int" source="num"/>
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
        <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
</CustomLayer>

<CustomLayer name="RegionYolo" type="MVCL" version="1">
    <Where do_softmax="1"/>
    <Kernel entry="region_hwc">
        <Source filename="region.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BYXF"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BYXF"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
            <Scalar arg-name="coords" type="int" source="coords"/>
            <Scalar arg-name="num" type="int" source="num"/>
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
        <WorkSizes global="((X+7)/8)*8*Y,num,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
    <Where do_softmax="1"/>
    <Kernel entry="region_hwc">
        <Source filename="region_hwc.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
            <Scalar arg-name="coords" type="int" source="coords"/>
            <Scalar arg-name="num" type="int" source="num"/>
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
        <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
</CustomLayer>

<CustomLayer name="RegionYolo" type="MVCL" version="1">
    <Where do_softmax="0" mask="0,1,2"/>
    <Kernel entry="region_hwc">
        <Source filename="region.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BYXF"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BYXF"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*(coords+1+classes)*2"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
            <Scalar arg-name="coords" type="int" source="coords"/>
            <Scalar arg-name="num" type="int" source="num"/>
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
        <WorkSizes global="((X+7)/8)*8*Y,3,1" local="((X+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
    <Where do_softmax="0" mask="0,1,2"/>
    <Kernel entry="region_hwc">
        <Source filename="region_hwc.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="classes" type="int" source="classes"/>
            <Scalar arg-name="coords" type="int" source="coords"/>
            <Scalar arg-name="num" type="int" source="num"/>
            <Scalar arg-name="maskSize" type="int" source="3"/>
            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
        </Parameters>
        <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
    </Kernel>
</CustomLayer>

<!-- Pixel-wise kernel binding, local work group config is per line in the input tensor -->
<CustomLayer name="GRN" type="MVCL" version="1">
    <Kernel entry="grn_NCHW">
    <Kernel entry="grn">
        <Source filename="grn.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*F*2"/>
|
||||
<Data arg-name="local_dst" type="local_data" dim="input,0" size="X*F*2"/>
|
||||
<Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
|
||||
<Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
|
||||
<Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
|
||||
<Scalar arg-name="bias" type="float" source="bias"/>
|
||||
</Parameters>
|
||||
@ -136,7 +121,7 @@
|
||||
<!-- Two stage layer binding, first kernel computes mean and variance, the second one normalizes input tensor-->
|
||||
<CustomLayer name="MVN" type="MVCL" version="1">
|
||||
<Kernel entry="reduction_mean" stage="0">
|
||||
<Source filename="mvn.bin"/>
|
||||
<Source filename="mvn_reduction.bin"/>
|
||||
<Parameters>
|
||||
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
||||
<Tensor arg-name="mean" type="output_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
|
||||
@ -144,12 +129,11 @@
|
||||
<Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
|
||||
<Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
|
||||
<Scalar arg-name="across_channels" type="int" source="across_channels"/>
|
||||
<Data arg-name="src_line" type="local_data" dim="input,0" size="X*2"/>
|
||||
</Parameters>
|
||||
<WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
|
||||
</Kernel>
|
||||
<Kernel entry="mvn_scale" stage="1">
|
||||
<Source filename="mvn.bin"/>
|
||||
<Source filename="mvn_scale.bin"/>
|
||||
<Parameters>
|
||||
<Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
|
||||
<Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
|
||||
@ -160,8 +144,6 @@
|
||||
<Scalar arg-name="across_channels" type="int" source="across_channels"/>
|
||||
<Scalar arg-name="normalize_variance" type="int" source="normalize_variance"/>
|
||||
<Scalar arg-name="nparts" type="int" port-index="0" source="I.Y"/>
|
||||
<Data arg-name="src_line" type="local_data" dim="input,0" size="X*2"/>
|
||||
<Data arg-name="dst_line" type="local_data" dim="input,0" size="X*2"/>
|
||||
</Parameters>
|
||||
<WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
|
||||
</Kernel>
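Stage 0 writes per-row partial sums into the intermediate output_buffer rather than final statistics; stage 1 folds the nparts partial sums per channel into a mean and variance before normalizing. A sketch of that reduction in plain C, mirroring what mvn_scale later does on device (mean_part and power_part are illustrative buffer names):

/* Fold nparts partial sums of x and x^2 into mean and variance. */
float mean = 0.f, power = 0.f;
for (int i = 0; i < nparts; i++) {
    mean  += mean_part[i];   /* partial sums of x   */
    power += power_part[i];  /* partial sums of x^2 */
}
mean /= (float)(W * H);
float variance = power / (float)(W * H) - mean * mean; /* E[x^2] - E[x]^2 */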
@ -174,12 +156,10 @@
        <Parameters>
            <Tensor arg-name="probabilities" type="input" port-index="0" format="FYX"/>
            <Tensor arg-name="sequence_indicators" type="input" port-index="1" format="BF"/>
            <Tensor arg-name="output_sequences" type="output" port-index="0" format="BFYX"/>
            <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="width" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="height" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="F*Y*X*2"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="F*Y*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="1,1,1" local="1,1,1"/>
    </Kernel>
@ -204,64 +184,36 @@

<!-- Reference version of the generic quantize layer; should be changed to FakeQuantize -->
<CustomLayer name="FakeQuantize" type="MVCL" version="1">
    <!-- <Where levels="2"/> -->
    <Kernel entry="quantize">
        <Source filename="quantize.bin"/>
        <Source filename="fakequantize.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="input_low" type="input" port-index="1" format="ANY"/>
            <Tensor arg-name="input_high" type="input" port-index="2" format="ANY"/>
            <Tensor arg-name="output_low" type="input" port-index="3" format="ANY"/>
            <Tensor arg-name="output_high" type="input" port-index="4" format="ANY"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="levels" type="int" source="levels"/>
            <Scalar arg-name="input_low_size" type="int" port-index="1" source="I.F"/>
            <Scalar arg-name="input_high_size" type="int" port-index="2" source="I.F"/>
            <Scalar arg-name="output_low_size" type="int" port-index="3" source="I.F"/>
            <Scalar arg-name="output_high_size" type="int" port-index="4" source="I.F"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*F*2"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*F*2"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
        </Parameters>
        <WorkSizes dim="input,0" global="1,Y,1" local="1,1,1"/>
        <WorkSizes dim="input,0" global="1,Y,F" local="1,Y,1"/>
    </Kernel>
</CustomLayer>

<!-- Reference version of the generic quantize layer; should be changed to FakeQuantize -->
<CustomLayer name="QuantizeTemporaryType" type="MVCL" version="1">
    <Where levels="256"/>
    <Kernel entry="quantize">
        <Source filename="binary_layers.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="input_low" type="input" port-index="1" format="BFYX"/>
            <Tensor arg-name="input_high" type="input" port-index="2" format="BFYX"/>
            <Tensor arg-name="output_low" type="input" port-index="3" format="BFYX"/>
            <Tensor arg-name="output_high" type="input" port-index="4" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="levels" type="int" source="levels"/>
            <Scalar arg-name="input_low_size" type="int" source="input_low_size"/>
            <Scalar arg-name="input_high_size" type="int" source="input_high_size"/>
            <Scalar arg-name="output_low_size" type="int" source="output_low_size"/>
            <Scalar arg-name="output_high_size" type="int" source="output_high_size"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
            <Data arg-name="src_local" type="local_data" dim="input,0" size="X*Y*2"/>
            <Data arg-name="dst_local" type="local_data" dim="input,0" size="X*Y*2"/>
        </Parameters>
        <WorkSizes dim="input,0" global="1,1,F" local="1,1,1"/>
    </Kernel>
</CustomLayer>

<CustomLayer name="QuantizeTemporaryType" type="MVCL" version="1">
<CustomLayer name="FakeQuantizeBin" type="MVCL" version="1">
    <Where levels="2"/>
    <Kernel entry="binarization">
        <Source filename="binary_layers.bin"/>
        <Source filename="binarization.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="input_low_high" type="input" port-index="1" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="switch_out" type="int" source="switch_out"/>
            <Scalar arg-name="input_low_high_size" type="int" source="input_low_size"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
@ -301,9 +253,6 @@
            <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
            <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
            <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>

            <Data arg-name="src_local" type="local_data" dim="input,0" size="X*F*3*2"/>
            <Data arg-name="dst_local" type="local_data" dim="output,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
    </Kernel>
@ -331,9 +280,6 @@
            <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
            <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
            <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>

            <Data arg-name="src_local" type="local_data" dim="input,0" size="X*F*2"/>
            <Data arg-name="dst_local" type="local_data" dim="output,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
    </Kernel>
@ -343,7 +289,7 @@

<!-- An example of a kernel binding that uses a data blob from the IR -->
<CustomLayer name="BinaryConvolution" type="MVCL" version="1">
    <Kernel entry="binary_convolution">
        <Source filename="binary_layers.bin"/>
        <Source filename="binary_convolution.bin"/>
        <Parameters>
            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
            <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
@ -369,12 +315,10 @@
<CustomLayer name="Resample" type="MVCL" version="1">
    <Where antialias="0"/>
    <Kernel entry="resample_nearest">
        <Source filename="resample_nn.bin"/>
        <Source filename="resample_noAA.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*ceil(1/factor)*F*2"/>
            <Data arg-name="local_dst" type="local_data" dim="output,0" size="X*F*2"/>
            <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="factor" type="float" source="factor"/>
@ -389,12 +333,10 @@
<CustomLayer name="Resample" type="MVCL" version="1">
    <Where antialias="1"/>
    <Kernel entry="resample_with_antialias">
        <Source filename="resample_with_antialias.bin"/>
        <Source filename="resample_AA.bin"/>
        <Parameters>
            <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*5*F*2"/>
            <Data arg-name="local_dst" type="local_data" dim="output,0" size="X*F*2"/>
            <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
            <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
            <Scalar arg-name="factor" type="float" source="factor"/>
@ -409,7 +351,7 @@
<CustomLayer name="Convolution" type="MVCL" version="1">
    <Where kernel="1,1" dilation="1,1"/>
    <Kernel entry="Convolution1x1_NCHW">
        <Source filename="convolution1x1.bin"/>
        <Source filename="convolution1x1_chw.bin"/>
        <Parameters>
            <Tensor arg-name="in" type="input" port-index="0" format="BFYX"/>
            <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
@ -429,10 +371,6 @@
            <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
            <Scalar arg-name="output" type="int" port-index="0" source="output"/>
            <Scalar arg-name="group" type="int" port-index="0" source="group"/>

            <Data arg-name="in_local" type="local_data" dim="input,0" size="X*F*2"/>
            <Data arg-name="out_local" type="local_data" dim="output,0" size="X*2"/>

        </Parameters>
        <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
    </Kernel>
@ -441,7 +379,7 @@
<CustomLayer name="Convolution" type="MVCL" version="1">
    <Where kernel="1,1" dilation="1,1"/>
    <Kernel entry="Convolution1x1_NHWC">
        <Source filename="convolution1x1.bin"/>
        <Source filename="convolution1x1_hwc.bin"/>
        <Parameters>
            <Tensor arg-name="in" type="input" port-index="0" format="BYXF"/>
            <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
@ -461,9 +399,6 @@
            <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
            <Scalar arg-name="output" type="int" port-index="0" source="output"/>
            <Scalar arg-name="group" type="int" port-index="0" source="group"/>

            <Data arg-name="in_local" type="local_data" dim="input,0" size="X*F*2"/>
            <Data arg-name="out_local" type="local_data" dim="output,0" size="X*2"/>
        </Parameters>
        <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
    </Kernel>
@ -509,8 +444,6 @@
            <Tensor arg-name="input_feature_map" type="input" port-index="1" format="BFYX"/>
            <Tensor arg-name="input_rois" type="input" port-index="2" format="BFYX"/>
            <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
            <Data arg-name="local_input_priors" type="local_data" dim="input,1" size="X*2"/>
            <Data arg-name="local_output" type="local_data" dim="input,1" size="((X+7)/8)*12*2"/>
            <Scalar arg-name="grid_h" type="int" port-index="1" source="I.Y"/>
            <Scalar arg-name="grid_w" type="int" port-index="1" source="I.X"/>
            <Scalar arg-name="stride_h" type="float" source="stride_h"/>
@ -530,8 +463,6 @@
            <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="scale" type="float" source="scale"/>
            <Scalar arg-name="bias" type="float" source="bias"/>
            <Data arg-name="local_src" type="local_data" dim="input,0" size="X*1"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*2"/>
        </Parameters>
        <WorkSizes dim="input,0" global="X,Y,F" local="X,1,1"/>
    </Kernel>
@ -570,7 +501,6 @@
            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
            <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
            <Data arg-name="local_dst" type="local_data" dim="input,0" size="X*F*2"/>
        </Parameters>
        <WorkSizes dim="input,0" global="(X+511)/512,Y,1" local="1,1,1"/>
    </Kernel>
@ -3,88 +3,46 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define USE_MANUAL_DMA 1

#if defined (USE_MANUAL_DMA)

__kernel void __dma_preload_cvtu8f16(
    __global uchar* restrict src,
    __global half* restrict dst,
    float scale,
    float bias,
    __local uchar* restrict local_src,
    __local half* restrict local_dst)
__kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias)
{
    WorkGroupDmaCreate3DTransaction(
        src + get_group_id(0)*get_local_size(0)
            + get_group_id(1)*get_local_size(1)*get_global_size(0)
            + get_group_id(2)*get_local_size(2)*get_global_size(0)*get_global_size(1), // src
    __local uchar local_src[8 * 1024];
    __local half local_dst[8 * 1024];

    event_t e1 = async_work_group_copy_3D3D(
        local_src, // dst
        get_local_size(0) * sizeof(uchar), // src width
        get_local_size(0) * sizeof(uchar), // dst width
        get_global_size(0) * sizeof(uchar), // src stride
        get_local_size(0) * sizeof(uchar), // dst stride
        src + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
            + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // src
        get_local_size(0), // num_elements_per_line
        get_local_size(0) * get_local_size(1) / (get_local_size(0)), // num_lines
        get_global_size(0) - get_local_size(0), // src_line_stride
        0, // dst_line_stride
        get_local_size(2), // num planes
        get_global_size(0) * get_global_size(1) * sizeof(uchar), // src plane stride
        get_local_size(0) * get_local_size(1) * sizeof(uchar), // dst plane stride
        get_local_size(0) * get_local_size(1) * sizeof(uchar), // plane size
        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src plane stride
        0, // dst plane stride
        0);
}
    wait_group_events(1, &e1);

__kernel void __dma_postwrite_cvtu8f16(
    __global uchar* restrict src,
    __global half* restrict dst,
    float scale,
    float bias,
    __local uchar* restrict local_src,
    __local half* restrict local_dst)
{
    WorkGroupDmaCreate3DTransaction(
    size_t idx = get_local_id(0)
        + get_local_id(1) * get_local_size(0)
        + get_local_id(2) * get_local_size(0) * get_local_size(1);

    local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias;

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_3D3D(
        dst + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0)
            + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), // dst
        local_dst, // src
        dst + get_group_id(0)*get_local_size(0)
            + get_group_id(1)*get_local_size(1)*get_global_size(0)
            + get_group_id(2)*get_local_size(2)*get_global_size(0)*get_global_size(1), // dst
        get_local_size(0) * sizeof(half), // src width
        get_local_size(0) * sizeof(half), // dst width
        get_local_size(0) * sizeof(half), // src stride
        get_global_size(0) * sizeof(half), // dst stride
        get_local_size(2), // num planes
        get_local_size(0) * get_local_size(1) * sizeof(half), // src plane stride
        get_global_size(0) * get_global_size(1) * sizeof(half), // dst plane stride
        get_local_size(0) * get_local_size(1) * sizeof(half), // plane size
        get_local_size(0), // num_elements_per_line
        get_local_size(1), // num_lines
        0, // src_line_stride
        get_global_size(0) - get_local_size(0), // dst_line_stride
        get_local_size(2), // num_planes
        0, // src_plane_stride
        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
        0);
    wait_group_events(1, &e2);
}

__kernel void cvtu8f16(
    __global uchar* restrict src,
    __global half* restrict dst,
    float scale,
    float bias,
    __local uchar* restrict local_src,
    __local half* restrict local_dst)
{
    size_t idx = get_local_id(0) +
        get_local_id(1)*get_local_size(0) +
        get_local_id(2)*get_local_size(0)*get_local_size(1);
    local_dst[idx] = convert_half(local_src[idx])*(half)scale+(half)bias;
}

#else // defined (USE_MANUAL_DMA)

__kernel void cvtu8f16(
    __global uchar* restrict src,
    __global half* restrict dst,
    float scale,
    float bias,
    __local uchar* restrict local_src, // unused, added for compatibility with DMA variant
    __local half* restrict local_dst) // unused, added for compatibility with DMA variant
{
    int idx = get_global_id(0) +
        get_global_id(1) * get_global_size(0) +
        get_global_id(2) * get_global_size(0) * get_global_size(1);
    dst[idx] = convert_half(src[idx])*(half)scale+(half)bias;
}

#endif // defined (USE_MANUAL_DMA)
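Both variants reduce to a single fused multiply-add per element: widen the uchar to half, scale, and offset. A host-side reference in plain C for checking results, with float standing in for half (the function name is illustrative, not from the kernel source):

/* Reference for cvtu8f16: dst[i] = convert_half(src[i]) * scale + bias. */
void cvtu8f16_ref(const unsigned char *src, float *dst, int n, float scale, float bias)
{
    for (int i = 0; i < n; i++) {
        dst[i] = (float)src[i] * scale + bias; /* device code does this in half precision */
    }
}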
@ -3,102 +3,63 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void __dma_preload_experimental_detectron_prior_grid_generator(
    __global const half* restrict input_priors,
    __global const half* restrict input_feature_map,
    __global const half* restrict input_rois,
    __global half* restrict output,
    __local half* restrict local_input_priors,
    __local half* restrict local_output,
    int grid_h,
    int grid_w,
    float stride_h,
    float stride_w,
    int num_priors,
    int num_anchors_per_prior) {

    // Move input_priors to local memory.
    WorkGroupDmaCreateStrideTransaction(
        input_priors, // src
        local_input_priors, // dst
        num_anchors_per_prior * num_priors * sizeof(half), // src_width
        num_anchors_per_prior * num_priors * sizeof(half), // dst_width
        num_anchors_per_prior * num_priors * sizeof(half), // src_stride
        num_anchors_per_prior * num_priors * sizeof(half), // dst_stride
        num_anchors_per_prior * num_priors * sizeof(half), // total_size
        0);
}

__kernel void __dma_postwrite_experimental_detectron_prior_grid_generator(
    __global const half* restrict input_priors,
    __global const half* restrict input_feature_map,
    __global const half* restrict input_rois,
    __global half* restrict output,
    __local half* restrict local_input_priors,
    __local half* restrict local_output,
    int grid_h,
    int grid_w,
    float stride_h,
    float stride_w,
    int num_priors,
    int num_anchors_per_prior) {

    int local_width = get_local_size(0);
    int width_start = get_group_id(0) * get_local_size(0);
    int width_end = min(width_start + local_width, grid_w);
    int width = width_end - width_start;

    WorkGroupDmaCreateStrideTransaction(
        local_output, // src
        output + get_group_id(0) * get_local_size(0) *
            num_anchors_per_prior * num_priors
            + get_group_id(1) * get_local_size(1) * grid_w *
            num_anchors_per_prior * num_priors, // dst
        width * num_anchors_per_prior * num_priors * sizeof(half), // src_width
        width * num_anchors_per_prior * num_priors * sizeof(half), // dst_width
        grid_w * num_anchors_per_prior * num_priors * sizeof(half), // src_stride
        grid_w * num_anchors_per_prior * num_priors * sizeof(half), // dst_stride
        width * num_anchors_per_prior * num_priors * sizeof(half), // total_size
        0);
}
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void experimental_detectron_prior_grid_generator(
    __global const half* restrict input_priors,
    __global const half* restrict input_feature_map,
    __global const half* restrict input_rois,
    __global half* restrict output,
    __local half* restrict local_input_priors,
    __local half* restrict local_output,
    __global const half *restrict input_priors,
    __global const half *restrict input_feature_map,
    __global const half *restrict input_rois,
    __global half *restrict output,
    int grid_h,
    int grid_w,
    float stride_h,
    float stride_w,
    int num_priors,
    int num_anchors_per_prior) {
    int num_anchors_per_prior)
{
    __local half local_input_priors[8 * 1024];
    __local half local_output[8 * 1024];

    int workgroup_width = get_local_size(0);
    int width_start = get_group_id(0) * workgroup_width;
    int width_end = min(width_start + workgroup_width, grid_w);
    int width = width_end - width_start;
    event_t e1 = async_work_group_copy(
        local_input_priors,
        input_priors,
        num_anchors_per_prior * num_priors,
        0);
    wait_group_events(1, &e1);

    int h = get_group_id(1);
    int w_idx = get_group_id(0) * workgroup_width;
    int width_start = get_group_id(0) * get_local_size(0);
    int width_end = min(width_start + get_local_size(0), (unsigned)grid_w);
    int width = width_end - width_start;

    int h = get_group_id(1);
    int w_idx = get_group_id(0) * get_local_size(0);
    for (int w = 0; w < width; ++w) {
        #pragma unroll 4
        for (int p = 0; p < num_priors; ++p) {
            local_output[(w * num_priors + p) * num_anchors_per_prior + 0] =
                local_input_priors[4 * p + 0] +
                convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
                local_input_priors[4 * p + 0]
                + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 1] =
                local_input_priors[4 * p + 1] +
                convert_half(stride_h) * (convert_half(h) + 0.5);
                local_input_priors[4 * p + 1] + convert_half(stride_h) * (convert_half(h) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 2] =
                local_input_priors[4 * p + 2] +
                convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
                local_input_priors[4 * p + 2]
                + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
            local_output[(w * num_priors + p) * num_anchors_per_prior + 3] =
                local_input_priors[4 * p + 3] +
                convert_half(stride_h) * (convert_half(h) + 0.5);
                local_input_priors[4 * p + 3] + convert_half(stride_h) * (convert_half(h) + 0.5);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
            + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior
            * num_priors, // dst
        local_output, // src
        width * num_anchors_per_prior * num_priors, // num_elements_per_line
        1, // num_lines
        (grid_w - width) * num_anchors_per_prior * num_priors, // src_line_stride
        (grid_w - width) * num_anchors_per_prior * num_priors, // dst_line_stride
        0);
    wait_group_events(1, &e2);
}
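The inner loop translates each 4-component prior box to the center of every feature-map cell: x coordinates are shifted by stride_w * (w + 0.5) and y coordinates by stride_h * (h + 0.5). Worked through for one cell (the numbers are illustrative, not from the tests):

/* Prior (-8, -8, 8, 8), stride_w = stride_h = 16, cell (w = 2, h = 1):  */
/* center = (16 * 2.5, 16 * 1.5) = (40, 24)                              */
/* anchor = (-8 + 40, -8 + 24, 8 + 40, 8 + 24) = (32, 16, 48, 32)        */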
111
inference-engine/src/vpu/custom_kernels/fakequantize.cl
Normal file
@ -0,0 +1,111 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void quantize(
    __global const half *restrict src_data,
    __global const half *restrict input_low,
    __global const half *restrict input_high,
    __global const half *restrict output_low,
    __global const half *restrict output_high,
    __global half *restrict dst_data,
    int levels,
    int input_low_size,
    int input_high_size,
    int output_low_size,
    int output_high_size,
    int W,
    int H)
{
    __local half local_src[15 * 1024];
    __local half local_dst[15 * 1024];

    event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
    wait_group_events(1, &e1);

    int c = get_group_id(2);

    half h_ilow = (input_low_size == 1 ? input_low[0] : input_low[c]);
    half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]);
    half h_olow = (output_low_size == 1 ? output_low[0] : output_low[c]);
    half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);

    half const1 = (half)(
        !(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
    half const2 =
        (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));

    __local const half *restrict src = local_src + W * get_local_id(1);
    __local half *restrict dst = local_dst + W * get_local_id(1);

    for (int w = 0; w < W / 8; w++) {
        half8 val = *((__local half8 *)src + w);
        half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;

        aux = (half8){
            (half)(short)(aux.s0),
            (half)(short)(aux.s1),
            (half)(short)(aux.s2),
            (half)(short)(aux.s3),
            (half)(short)(aux.s4),
            (half)(short)(aux.s5),
            (half)(short)(aux.s6),
            (half)(short)(aux.s7)};

        aux = aux * (half8)const2 + (half8)h_olow;

        short8 a;
        short8 b;
        a.s0 = (val.s0 <= h_ilow);
        a.s1 = (val.s1 <= h_ilow);
        a.s2 = (val.s2 <= h_ilow);
        a.s3 = (val.s3 <= h_ilow);
        a.s4 = (val.s4 <= h_ilow);
        a.s5 = (val.s5 <= h_ilow);
        a.s6 = (val.s6 <= h_ilow);
        a.s7 = (val.s7 <= h_ilow);

        b.s0 = (val.s0 > h_ihigh);
        b.s1 = (val.s1 > h_ihigh);
        b.s2 = (val.s2 > h_ihigh);
        b.s3 = (val.s3 > h_ihigh);
        b.s4 = (val.s4 > h_ihigh);
        b.s5 = (val.s5 > h_ihigh);
        b.s6 = (val.s6 > h_ihigh);
        b.s7 = (val.s7 > h_ihigh);

        a = ~(a - (short8)1);
        b = ~(b - (short8)1);

        short8 c1 = (~a & b);
        short8 c2 = (~a & ~b);

        short8 res = (a & as_short8((half8)h_olow)) | (c1 & as_short8((half8)h_ohigh)) | (c2 & as_short8(aux));

        *((__local half8 *)dst + w) = as_half8(res);
    }

    for (int w = W & (~0x7); w < W; w++) {
        half val = src[w];
        short a = val <= h_ilow;
        a = ~(a - 1);
        short b = val > h_ihigh;
        b = ~(b - 1);

        short c1 = (~a & b);
        short c2 = (~a & ~b);

        short res = (a & as_short(h_olow)) | (c1 & as_short(h_ohigh))
            | (c2 & as_short(((half)(round((val - h_ilow) * const1) * const2) + h_olow)));

        dst[w] = as_half(res);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
    wait_group_events(1, &e2);
}
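The comparisons yield 0/1 shorts, and ~(a - 1) stretches them into full 16-bit masks (1 becomes 0xFFFF, 0 becomes 0x0000), so each lane picks output_low, output_high, or the quantized value branch-free by AND/OR over the raw half bit patterns. The trick in isolation, as a standalone C check (a hypothetical harness, not kernel code):

#include <stdio.h>

int main(void)
{
    for (short cond = 0; cond <= 1; cond++) {
        short mask = (short)~(cond - 1); /* 1 -> 0xFFFF (select), 0 -> 0x0000 (reject) */
        printf("cond=%d mask=0x%04X\n", cond, (unsigned short)mask);
    }
    return 0;
}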
@ -3,111 +3,61 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define USE_MANUAL_DMA 1

#if defined (USE_MANUAL_DMA)

__kernel void __dma_preload_grn_NCHW(
    __global const half* restrict src,
    __global half* restrict dst,
    __local half* restrict local_src,
    __local half* restrict local_dst,
    int C,
    float bias)
__kernel void grn(__global const half *restrict src_data, __global half *restrict dst_data, int C, float bias)
{
    WorkGroupDmaCreate3DTransaction(
        src + get_group_id(0)*get_local_size(0)
            + get_group_id(1)*get_local_size(1)*get_global_size(0), // src
        local_src, // dst
        get_local_size(0) * sizeof(half), // src width
        get_local_size(0) * sizeof(half), // dst width
        get_global_size(0) * sizeof(half), // src stride
        get_local_size(0) * sizeof(half), // dst stride
        C, // num planes
        get_global_size(0) * get_global_size(1) * sizeof(half), // src plane stride
        get_local_size(0) * get_local_size(1) * sizeof(half), // dst plane stride
        get_local_size(0) * get_local_size(1) * sizeof(half), // plane size
    __local half src[8 * 1024];
    __local half dst[8 * 1024];

    const size_t index = get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0);

    event_t e1 = async_work_group_copy_3D3D(
        src, // dst
        src_data + index, // src
        get_local_size(0), // num_elements_per_line,
        get_local_size(1), // num_lines,
        get_global_size(0) - get_local_size(0), // src_line_stride,
        0, // dst_line_stride,
        C, // num_planes,
        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src_plane_stride
        0, // dst_plane_stride
        0);
}
    wait_group_events(1, &e1);

__kernel void __dma_postwrite_grn_NCHW(
    __global const half* restrict src,
    __global half* restrict dst,
    __local const half* restrict local_src,
    __local half* restrict local_dst,
    int C,
    float bias)
{
    WorkGroupDmaCreate3DTransaction(
        local_dst, // src
        dst + get_group_id(0)*get_local_size(0)
            + get_group_id(1)*get_local_size(1)*get_global_size(0), // dst
        get_local_size(0) * sizeof(half), // src width
        get_local_size(0) * sizeof(half), // dst width
        get_local_size(0) * sizeof(half), // src stride
        get_global_size(0) * sizeof(half), // dst stride
        C, // num planes
        get_local_size(0) * get_local_size(1) * sizeof(half), // src plane stride
        get_global_size(0) * get_global_size(1) * sizeof(half), // dst plane stride
        get_local_size(0) * get_local_size(1) * sizeof(half), // plane size
        0);
}

__kernel void grn_NCHW(
    __global const half* restrict src,
    __global half* restrict dst,
    __local half* restrict local_src,
    __local half* restrict local_dst,
    int C,
    float bias)
{
    float variance = bias + 1e-9f;

#pragma unroll 8
    for (int c = 0; c < C; c++)
    {
        float val = (float) local_src[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)];
        variance += val*val;
    for (int c = 0; c < C; c++) {
        float val = (float)src[c * get_local_size(1) * get_local_size(0)
            + get_local_id(1) * get_local_size(0)
            + get_local_id(0)];
        variance += val * val;
    }

    half hvariance = (half)(native_rsqrt((half)(variance/16.f))*0.25f);
    half hvariance = (half)(native_rsqrt((half)(variance / 16.f)) * 0.25f);

#pragma unroll 8
    for (int c = 0; c < C; c++)
    {
        local_dst[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)]
            = local_src[c*get_local_size(1)*get_local_size(0) + get_local_id(1)*get_local_size(0) + get_local_id(0)] * hvariance;
    for (int c = 0; c < C; c++) {
        dst[c * get_local_size(1) * get_local_size(0)
            + get_local_id(1) * get_local_size(0)
            + get_local_id(0)] =
            src[c * get_local_size(1) * get_local_size(0)
                + get_local_id(1) * get_local_size(0) + get_local_id(0)] * hvariance;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_3D3D(
        dst_data + index, // dst
        dst, // src
        get_local_size(0), // num_elements_per_line,
        get_local_size(1), // num_lines,
        0, // src_line_stride,
        get_global_size(0) - get_local_size(0), // dst_line_stride,
        C, // num_planes,
        0, // src_plane_stride
        get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
        0);
    wait_group_events(1, &e2);
}

#else // defined (USE_MANUAL_DMA)

__kernel void grn_NCHW(
    __global const half* restrict src,
    __global half* restrict dst,
    __local half* restrict local_src, // unused, added for compatibility with DMA variant
    __local half* restrict local_dst, // unused, added for compatibility with DMA variant
    int C,
    float bias)
{
    float variance = bias + 1e-9f;

#pragma unroll 4
    for (int c = 0; c < C; c++)
    {
        float val = (float) src[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)];
        variance += val*val;
    }

    half hvariance = (half)(native_rsqrt((half)(variance/16.f))*0.25f);

#pragma unroll 4
    for (int c = 0; c < C; c++)
    {
        dst[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)]
            = src[c*get_global_size(1)*get_global_size(0) + get_global_id(1)*get_global_size(0) + get_global_id(0)] * hvariance;
    }
}

#endif // defined (USE_MANUAL_DMA)
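The scaling line relies on an fp16-friendly identity: rsqrt(v / 16) * 0.25 = 4/sqrt(v) * 0.25 = 1/sqrt(v), where dividing by 16 first keeps the half-precision intermediate away from overflow. Checking with a concrete value:

/* v = 4: native_rsqrt(4 / 16) = rsqrt(0.25) = 2, and 2 * 0.25 = 0.5 = 1/sqrt(4). */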
@ -1,390 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Define if the runtime supports it. The MX runtime is compatible; KMB is in a WIP state
#define USE_MANUAL_DMA 1

// Set to 1 only if the output is zeroed before kernel execution
#define USE_ATOMICS 0

void atomic_add_global(volatile __global float *source, const float operand) {
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;

    do {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}

#if defined (USE_MANUAL_DMA)

__kernel void __dma_preload_reduction_mean(const __global half* restrict src,
    __global float* restrict mean,
    __global float* restrict variance,
    int W,
    int H,
    int across_channels,
    __local half* restrict src_line)
{
    WorkGroupDmaCreateStrideTransaction(
        src + get_group_id(1)*get_local_size(1)*W +
            get_group_id(2)*get_local_size(2)*W*get_global_size(1), // src
        src_line, // dst
        W*get_local_size(1) * sizeof(half), // src width
        W*get_local_size(1) * sizeof(half), // dst width
        W*get_global_size(1) * sizeof(half), // src stride
        W*get_local_size(1) * sizeof(half), // dst stride
        W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size
        0
    );
}

__kernel void reduction_mean(const __global half* restrict src,
    __global float* restrict mean,
    __global float* restrict variance,
    int W,
    int H,
    int across_channels,
    __local half* restrict src_line)
{
    int h = get_global_id(1);
    int c = get_global_id(2);

    const int MAX_LOCAL_SIZE = 8;

    __local float mbuf[MAX_LOCAL_SIZE];
    __local float vbuf[MAX_LOCAL_SIZE];

    mbuf[get_local_id(1)] = 0;
    vbuf[get_local_id(1)] = 0;

    if (h < H)
    {
        float sum = 0.f;
        float sum2 = 0.f;

        float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
        float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

        const __local half8* lsrc = ((const __local half8*)(src_line + get_local_id(1)*W) );

#pragma unroll 16
        for (size_t w = 0; w < W/8; w++)
        {
            half8 sh = lsrc[w];
            float8 valf = convert_float8(sh);

            sum4 += valf;
            sum24 += valf*valf;
        }

        for (size_t w = W/8*8; w < W; w++)
        {
            float val = (float)src_line[get_local_id(1)*W + w];
            sum += val;
            sum2 += val*val;
        }

        mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum;
        vbuf[get_local_id(1)] = sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_local_id(1) == 0)
    {
        float res = 0;
        float res2 = 0;

        for (int i = 0; i < get_local_size(1); i++)
        {
            res += mbuf[i];
            res2 += vbuf[i];
        }

        // requires memory reset before layer execution
#if USE_ATOMICS
        int idx = (across_channels == 0) ? c : 0;

        atomic_add_global(mean + idx, res);
        atomic_add_global(variance + idx, res2);
#else
        int idx = c*get_num_groups(1) + get_group_id(1);

        mean[idx] = res;
        variance[idx] = res2;
#endif
    }
}

__kernel void __dma_preload_mvn_scale(const __global half * restrict src,
    __global half * restrict dst,
    __global float * restrict mean_part,
    __global float * restrict power_mean,
    int W,
    int H1,
    int across_channels,
    int normalize_variance,
    int nparts,
    __local half * restrict src_line,
    __local half * restrict dst_line
    )
{
    WorkGroupDmaCreateStrideTransaction(
        src + get_group_id(1)*get_local_size(1)*W +
            get_group_id(2)*get_local_size(2)*W*get_global_size(1), // src
        src_line, // dst
        W*get_local_size(1) * sizeof(half), // src width
        W*get_local_size(1) * sizeof(half), // dst width
        W*get_global_size(1) * sizeof(half), // src stride
        W*get_local_size(1) * sizeof(half), // dst stride
        W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size
        0
    );
}

__kernel void __dma_postwrite_mvn_scale(const __global half * restrict src,
    __global half * restrict dst,
    __global float * restrict mean_part,
    __global float * restrict power_mean,
    int W,
    int H1,
    int across_channels,
    int normalize_variance,
    int nparts,
    __local half * restrict src_line,
    __local half * restrict dst_line)
{
    WorkGroupDmaCreateStrideTransaction(
        dst_line, // src
        dst + get_group_id(1)*get_local_size(1)*W +
            get_group_id(2)*get_local_size(2)*W*get_global_size(1), // dst
        W*get_local_size(1) * sizeof(half), // src width
        W*get_local_size(1) * sizeof(half), // dst width
        W*get_local_size(1) * sizeof(half), // dst stride
        W*get_global_size(1) * sizeof(half), // src stride
        W*get_local_size(1)*get_local_size(2)*sizeof(half), //total size
        0
    );
}

__kernel void mvn_scale(const __global half * restrict src,
    __global half * restrict dst,
    __global float * restrict mean_part,
    __global float * restrict power_mean,
    int W,
    int H1,
    int across_channels,
    int normalize_variance,
    int nparts,
    __local half * restrict src_line,
    __local half * restrict dst_line)
{
    int h = get_global_id(1);
    int H = get_global_size(1);

    // can we avoid this check and use min/max? We can pass number of groups just as a param.
    //#if !USE_ATOMICS
    //    if (h >= H1) return;
    //#endif

    int c = get_global_id(2);
    int C = get_global_size(2);

    int idx = (across_channels == 0) ? nparts*c : 0;
    float scale = (across_channels == 0) ? H*W : H*W*C;

#if USE_ATOMICS
    float mean = mean_part[idx];
    float variance = power_mean[idx];
#else

    int total = (across_channels == 0) ? nparts : nparts*C;
    float mean = 0.f;
    float variance = 0.f;

    for (int i = 0; i < total; i++)
    {
        mean += mean_part[idx+i];
        variance += power_mean[idx+i];
    }
#endif

    mean = mean/scale;
    variance = variance/scale;
    variance = variance - mean*mean;
    variance = native_sqrt(variance) + 1e-9f;

    half hmean = mean;
    half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);

    const __local half8 * restrict src_data8 = (const __local half8 * restrict)(src_line + get_local_id(1)*W);
    __local half8 * restrict dst_data8 = (__local half8 * restrict)(dst_line + get_local_id(1)*W);

#pragma unroll 16
    for (size_t w = 0; w < W/8; w++)
    {
        dst_data8[w] = (src_data8[w] - hmean) * hvariance;
    }
    for (size_t w = W/8*8; w < W; w++)
    {
        dst_line[get_local_id(1)*W + w] = (src_line[get_local_id(1)*W + w] - hmean) * hvariance;
    }
}

#else

__kernel void reduction_mean(const __global half* restrict src,
    __global float* restrict mean,
    __global float* restrict variance,
    int W,
    int H,
    int across_channels,
    __local half* restrict src_line) // for compatibility with the DMA kernel
{
    int h = get_global_id(1);
    int c = get_global_id(2);

    const int MAX_LOCAL_SIZE = 8;

    __local float mbuf[MAX_LOCAL_SIZE];
    __local float vbuf[MAX_LOCAL_SIZE];

    mbuf[get_local_id(1)] = 0;
    vbuf[get_local_id(1)] = 0;

    if (h < H)
    {
        float sum = 0.f;
        float sum2 = 0.f;

        float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
        float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

        const __global half8* src_line = (const __global half8 *)(src + c*H*W + h*W);

#pragma unroll 16
        for (size_t w = 0; w < W/8; w++)
        {
            half8 sh = src_line[w];
            float8 valf = convert_float8(sh);

            sum4 += valf;
            sum24 += valf*valf;
        }

        for (size_t w = W/8*8; w < W; w++)
        {
            float val = (float)src[c*H*W + h*W + w];

            sum += val;
            sum2 += val*val;
        }

        mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum;
        vbuf[get_local_id(1)] = sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_local_id(1) == 0)
    {
        float res = 0;
        float res2 = 0;

        for (int i = 0; i < get_local_size(1); i++)
        {
            res += mbuf[i];
            res2 += vbuf[i];
        }

        // requires memory reset before layer execution
#if USE_ATOMICS
        int idx = (across_channels == 0) ? c : 0;

        atomic_add_global(mean + idx, res);
        atomic_add_global(variance + idx, res2);
#else
        int idx = c*get_num_groups(1) + get_group_id(1);

        mean[idx] = res;
        variance[idx] = res2;
#endif
    }
}

__kernel void mvn_scale(const __global half * restrict src_data,
    __global half * restrict dst_data,
    __global float * restrict mean_part,
    __global float * restrict power_mean,
    int W,
    int H1,
    int across_channels,
    int normalize_variance,
    int nparts,
    __local half * restrict src_line,
    __local half * restrict dst_line)
{
    int h = get_global_id(1);
    int H = get_global_size(1);

    // can we avoid this check and use min/max? We can pass number of groups just as a param.
    //#if !USE_ATOMICS
    //    if (h >= H1) return;
    //#endif

    int c = get_global_id(2);
    int C = get_global_size(2);

    int idx = (across_channels == 0) ? nparts*c : 0;
    float scale = (across_channels == 0) ? H*W : H*W*C;

#if USE_ATOMICS
    float mean = mean_part[idx];
    float variance = power_mean[idx];
#else

    int total = (across_channels == 0) ? nparts : nparts*C;
    float mean = 0.f;
    float variance = 0.f;

    for (int i = 0; i < total; i++)
    {
        mean += mean_part[idx+i];
        variance += power_mean[idx+i];
    }
#endif

    mean = mean/scale;
    variance = variance/scale;
    variance = variance - mean*mean;
    variance = native_sqrt(variance) + 1e-9f;

    half hmean = mean;
    half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);

    const __global half8 * restrict src_data8 = (const __global half8 * restrict)(src_data + c*H*W + h*W);
    __global half8 * restrict dst_data8 = (__global half8 * restrict)(dst_data + c*H*W + h*W);

#pragma unroll 16
    for (size_t w = 0; w < W/8; w++)
    {
        dst_data8[w] = (src_data8[w] - hmean) * hvariance;
    }
    for (size_t w = W/8*8; w < W; w++)
    {
        dst_data[c*H*W + h*W + w] = (src_data[c*H*W + h*W + w] - hmean) * hvariance;
    }
}

#endif // USE_MANUAL_DMA
115
inference-engine/src/vpu/custom_kernels/mvn_reduction.cl
Normal file
@ -0,0 +1,115 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

// Set to 1 only if the output is zeroed before kernel execution
#define USE_ATOMICS 0

void atomic_add_global(volatile __global float *source, const float operand)
{
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;

    do {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}

__kernel void reduction_mean(
    __global const half *restrict src,
    __global float *restrict mean,
    __global float *restrict variance,
    int W,
    int H,
    int across_channels)
{
    __local half src_line[4 * 1024];
    event_t e;

    e = async_work_group_copy_2D2D(
        src_line, // dst
        src + get_group_id(1) * get_local_size(1) * W
            + get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src
        W * get_local_size(1), // num_elements_per_line,
        get_local_size(2), // num_lines,
        W * (get_global_size(1) - get_local_size(1)), // src_line_stride,
        0, // dst_line_stride,
        0);

    wait_group_events(1, &e);

    int h = get_global_id(1);
    int c = get_global_id(2);

    const int MAX_LOCAL_SIZE = 8;

    __local float mbuf[MAX_LOCAL_SIZE];
    __local float vbuf[MAX_LOCAL_SIZE];

    mbuf[get_local_id(1)] = 0;
    vbuf[get_local_id(1)] = 0;

    if (h < H) {
        float sum = 0.f;
        float sum2 = 0.f;

        float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
        float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

        const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W));

#pragma unroll 16
        for (size_t w = 0; w < W / 8; w++) {
            half8 sh = lsrc[w];
            float8 valf = convert_float8(sh);

            sum4 += valf;
            sum24 += valf * valf;
        }

        for (size_t w = W / 8 * 8; w < W; w++) {
            float val = (float)src_line[get_local_id(1) * W + w];
            sum += val;
            sum2 += val * val;
        }

        mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum;
        vbuf[get_local_id(1)] =
            sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if (get_local_id(1) == 0) {
        float res = 0;
        float res2 = 0;

        for (int i = 0; i < get_local_size(1); i++) {
            res += mbuf[i];
            res2 += vbuf[i];
        }

        // requires memory reset before layer execution
#if USE_ATOMICS
        int idx = (across_channels == 0) ? c : 0;

        atomic_add_global(mean + idx, res);
        atomic_add_global(variance + idx, res2);
#else
        int idx = c * get_num_groups(1) + get_group_id(1);

        mean[idx] = res;
        variance[idx] = res2;
#endif
    }
}
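atomic_add_global emulates a floating-point atomic add with a compare-and-swap loop: read the current value, compute the new sum, and retry whenever another work-item won the race. The same pattern in host-side C11 atomics, as an illustration only (this helper is hypothetical, not part of the kernels):

#include <stdatomic.h>

/* Retry until our read of *source was not overtaken by another thread. */
static void atomic_add_float(_Atomic unsigned int *source, float operand)
{
    union { unsigned int u; float f; } prev, next;
    do {
        prev.u = atomic_load(source);
        next.f = prev.f + operand;
    } while (!atomic_compare_exchange_weak(source, &prev.u, next.u));
}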
68
inference-engine/src/vpu/custom_kernels/mvn_scale.cl
Normal file
@ -0,0 +1,68 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

// Set to 1 only if the output is zeroed before kernel execution
#define USE_ATOMICS 0

__attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale(
    const __global half *restrict src,
    __global float *restrict mean_part,
    __global float *restrict power_mean,
    __global half *restrict dst,
    int W,
    int H1,
    int across_channels,
    int normalize_variance,
    int nparts)
{
    __local half src_line[4 * 1024];
    __local half dst_line[4 * 1024];

    int c = get_group_id(2);
    int C = get_global_size(2);

    int h = get_group_id(1);
    int H = get_global_size(1);

    event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0);
    wait_group_events(1, &e1);

    int idx = (across_channels == 0) ? nparts * c : 0;
    float scale = (across_channels == 0) ? H * W : H * W * C;

#if USE_ATOMICS
    float mean = mean_part[idx];
    float variance = power_mean[idx];
#else

    int total = (across_channels == 0) ? nparts : nparts * C;
    float mean = 0.f;
    float variance = 0.f;

    for (int i = 0; i < total; i++) {
        mean += mean_part[idx + i];
        variance += power_mean[idx + i];
    }
#endif

    mean = mean / scale;
    variance = variance / scale;
    variance = variance - mean * mean;
    variance = native_sqrt(variance) + 1e-9f;

    half hmean = mean;
    half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);

    for (size_t w = 0; w < W; w++) {
        dst_line[w] = (src_line[w] - hmean) * hvariance;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0);
    wait_group_events(1, &e2);
}
|
@ -1,176 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void __dma_preload_quantize(__global half const *const restrict src,
                                     __global half const *const restrict input_low,
                                     __global half const *const restrict input_high,
                                     __global half const *const restrict output_low,
                                     __global half const *const restrict output_high,
                                     __global half *const restrict dst,
                                     int levels,
                                     int input_low_size,
                                     int input_high_size,
                                     int output_low_size,
                                     int output_high_size,
                                     int W,
                                     int C,
                                     __local half *const restrict local_src,
                                     __local half const *const restrict local_dst)
{
    WorkGroupDmaCreateStrideTransaction(
        src + get_group_id(1) * get_local_size(1) * W, // src
        local_src, // dst
        W * sizeof(half), // src_width,
        W * sizeof(half), // dst_width,
        get_global_size(1) * W * sizeof(half), // src_stride,
        W * sizeof(half), // dst_stride,
        W * C * sizeof(half), // size
        0);
}

__kernel void __dma_postwrite_quantize(__global half const *const restrict src,
                                       __global half const *const restrict input_low,
                                       __global half const *const restrict input_high,
                                       __global half const *const restrict output_low,
                                       __global half const *const restrict output_high,
                                       __global half *const restrict dst,
                                       int levels,
                                       int input_low_size,
                                       int input_high_size,
                                       int output_low_size,
                                       int output_high_size,
                                       int W,
                                       int C,
                                       __local half const *const restrict local_src,
                                       __local half const *const restrict local_dst)
{
    WorkGroupDmaCreateStrideTransaction(
        local_dst, // src
        dst + get_group_id(1) * get_local_size(1) * W, // dst
        W * sizeof(half), // src_width,
        W * sizeof(half), // dst_width,
        W * sizeof(half), // src_stride,
        get_global_size(1) * W * sizeof(half), // dst_stride,
        W * C * sizeof(half), // size
        0);
}

__kernel void quantize(__global half const *const restrict src,
                       __global half const *const restrict input_low,
                       __global half const *const restrict input_high,
                       __global half const *const restrict output_low,
                       __global half const *const restrict output_high,
                       __global half const *const restrict dst,
                       int levels,
                       int input_low_size,
                       int input_high_size,
                       int output_low_size,
                       int output_high_size,
                       int W,
                       int C,
                       __local half const *const restrict local_src,
                       __local half *const restrict local_dst)
{
    int h = get_global_id(1);
    int H = get_global_size(1);

    for (int c = 0; c < C; c++)
    {
        half h_ilow  = (input_low_size   == 1 ? input_low[0]   : input_low[c]);
        half h_ihigh = (input_high_size  == 1 ? input_high[0]  : input_high[c]);
        half h_olow  = (output_low_size  == 1 ? output_low[0]  : output_low[c]);
        half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);

        half const1 = (half)(!(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
        half const2 = (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));

        __local const half* restrict addr_src = local_src + c*W;
        __local half* restrict addr_dst = local_dst + c*W;

        for (int w = 0; w < W / 8; w++)
        {
            half8 val = *((__local half8*)addr_src + w);
#if 1
            // round is too slow =( 902 b of code
            //half8 aux = round((val - (half8)h_ilow) * (half8)const1);

            half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h;

            aux = (half8){
                (half)(short)(aux.s0),
                (half)(short)(aux.s1),
                (half)(short)(aux.s2),
                (half)(short)(aux.s3),
                (half)(short)(aux.s4),
                (half)(short)(aux.s5),
                (half)(short)(aux.s6),
                (half)(short)(aux.s7)
            };

            aux = aux * (half8)const2 + (half8)h_olow;

            // vector comparison adds 756 b of assembly, so do it manually
            // short8 a = val <= (half8)h_olow;
            // short8 b = val >  (half8)h_ohigh;

            short8 a;
            short8 b;
            a.s0 = (val.s0 <= h_ilow);
            a.s1 = (val.s1 <= h_ilow);
            a.s2 = (val.s2 <= h_ilow);
            a.s3 = (val.s3 <= h_ilow);
            a.s4 = (val.s4 <= h_ilow);
            a.s5 = (val.s5 <= h_ilow);
            a.s6 = (val.s6 <= h_ilow);
            a.s7 = (val.s7 <= h_ilow);

            b.s0 = (val.s0 > h_ihigh);
            b.s1 = (val.s1 > h_ihigh);
            b.s2 = (val.s2 > h_ihigh);
            b.s3 = (val.s3 > h_ihigh);
            b.s4 = (val.s4 > h_ihigh);
            b.s5 = (val.s5 > h_ihigh);
            b.s6 = (val.s6 > h_ihigh);
            b.s7 = (val.s7 > h_ihigh);

            a = ~(a-(short8)1);
            b = ~(b-(short8)1);

            short8 c1 = (~a &  b);
            short8 c2 = (~a & ~b);

            short8 res = a & as_short8((half8)h_olow)
                       | c1 & as_short8((half8)h_ohigh)
                       | c2 & as_short8(aux);

            *((__local half8*)addr_dst + w) = as_half8(res);
#else
            *((__local half8*)addr_dst + w) = val;
#endif
        }

        for (int w = W & (~0x7); w < W; w++)
        //for (int w = 0 ; w < W; w++)
        {
            half val = addr_src[w];
#if 1
            short a = val <= h_ilow;  a = ~(a-1);
            short b = val >  h_ihigh; b = ~(b-1);

            short c1 = (~a &  b);
            short c2 = (~a & ~b);

            short res = a & as_short(h_olow)
                      | c1 & as_short(h_ohigh)
                      | c2 & as_short(((half)(round( (val - h_ilow) * const1) * const2) + h_olow));

            addr_dst[w] = as_half(res);
#else
            addr_dst[w] = val;
#endif
        }
    }
}
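For reference, the branchless select logic in the deleted kernel above computes the usual FakeQuantize transfer function. A plain scalar sketch of what each element goes through (quantize_ref is a hypothetical helper, named here only for illustration):

    half quantize_ref(half x, half ilow, half ihigh, half olow, half ohigh, int levels)
    {
        if (x <= ilow)  return olow;  // below input range: clamp to output low
        if (x >  ihigh) return ohigh; // above input range: clamp to output high
        // const1 and const2 in the kernel are these two scale factors
        half q = round((x - ilow) * (half)(levels - 1) / (ihigh - ilow));
        return q * (ohigh - olow) / (half)(levels - 1) + olow;
    }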
@ -1,474 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))

#define ALLOW_EARLY_RETURN 1

#define USE_MANUAL_DMA 1

#if USE_MANUAL_DMA

static void inline logistic_activate(__local const half* restrict src,
                                     __local half* restrict dst,
                                     int offset)
{
    half val = src[offset];
    val = 1.0h / (1.0h + exp2(val * -log_2_e));
    dst[offset] = val;
}

__kernel void __dma_preload_region_chw(
    __global const half* restrict src,
    __global half* restrict _0,
    __local half* restrict local_src,
    __local half* restrict _1,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    const int local_C = classes + coords + 1;
    const int c = get_group_id(1)*local_C;
    const int h = get_group_id(0);

    WorkGroupDmaCreateStrideTransaction(
        src + c*H*W + h*W,      // src
        local_src,              // dst
        W*sizeof(half),         // src_width,
        W*sizeof(half),         // dst_width,
        W*H*sizeof(half),       // src_stride,
        W*sizeof(half),         // dst_stride,
        W*local_C*sizeof(half), // size
        0);
}

__kernel void __dma_postwrite_region_chw(
    __global half* restrict _0,
    __global half* restrict dst,
    __local half* restrict _1,
    __local const half* restrict local_dst,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    const int local_C = classes + coords + 1;
    const int c = get_group_id(1)*local_C;
    const int h = get_group_id(0);

    WorkGroupDmaCreateStrideTransaction(
        local_dst,              // src
        dst + c*H*W + h*W,      // dst
        W*sizeof(half),         // src_width,
        W*sizeof(half),         // dst_width,
        W*sizeof(half),         // src_stride,
        W*H*sizeof(half),       // dst_stride,
        W*local_C*sizeof(half), // size
        0);
}

__kernel void region_chw(
    __global half* restrict src_data,
    __global half* restrict dst_data,
    __local const half* restrict local_src,
    __local half* restrict local_dst,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    const int w = get_local_id(0);

#if ALLOW_EARLY_RETURN
    if (w >= W) return;
#endif

    __local const half *restrict src = local_src + w;
    __local half *restrict dst = local_dst + w;

    const int stride = W;
    logistic_activate(src, dst, 0*stride);
    logistic_activate(src, dst, 1*stride);

    //copy plane 2 and 3
    dst[2*stride] = src[2*stride];
    dst[3*stride] = src[3*stride];

    logistic_activate(src, dst, 4*stride);

    src += (coords + 1)*stride;
    dst += (coords + 1)*stride;

    if (doSoftmax)
    {
        half max_val = src[0];
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            max_val = max(max_val, src[c*stride]);
        }

        half expSum = 0.0h;
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            const half e = src[c*stride] - max_val;
            const half tmp = exp2(e * log_2_e);
            dst[c*stride] = tmp;
            expSum += tmp;
        }

        const half invExpSum = 1.0h / expSum;
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            dst[c*stride] *= invExpSum;
        }
    }
    else
    {
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            logistic_activate(src, dst, c*stride);
        }
    }
}

__kernel void __dma_preload_region_hwc(
    __global const half* restrict src,
    __global half* restrict _0,
    __local half* restrict local_src,
    __local half* restrict _1,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    const int local_C = classes + coords + 1;
    const int c = get_group_id(1)*local_C;
    const int h = get_group_id(0);
    if (!doSoftmax) num = maskSize;
    const int C = local_C*num;

    WorkGroupDmaCreateStrideTransaction(
        src + h*W*C + c,        // src
        local_src,              // dst
        local_C*sizeof(half),   // src_width,
        local_C*sizeof(half),   // dst_width,
        C*sizeof(half),         // src_stride,
        local_C*sizeof(half),   // dst_stride,
        local_C*W*sizeof(half), // size
        0);
}

__kernel void __dma_postwrite_region_hwc(
    __global half* restrict _0,
    __global half* restrict dst,
    __local half* restrict _1,
    __local const half* restrict local_dst,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    // Region always outputs in CHW layout; same as postwrite_chw
    const int local_C = classes + coords + 1;
    const int c = get_group_id(1)*local_C;
    const int h = get_group_id(0);

    WorkGroupDmaCreateStrideTransaction(
        local_dst,              // src
        dst + c*H*W + h*W,      // dst
        W*sizeof(half),         // src_width,
        W*sizeof(half),         // dst_width,
        W*sizeof(half),         // src_stride,
        W*H*sizeof(half),       // dst_stride,
        W*local_C*sizeof(half), // size
        0);
}

static void inline logistic_activate_hwc(__local const half* restrict src,
                                         __local half* restrict dst,
                                         int offset,
                                         int stride)
{
    half val = src[offset];
    val = 1.0h / (1.0h + exp2(val * -log_2_e));
    dst[offset*stride] = val;
}

__kernel void region_hwc(
    __global half* restrict src_data,
    __global half* restrict dst_data,
    __local const half* restrict local_src,
    __local half* restrict local_dst,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    const int w = get_local_id(0);

#if ALLOW_EARLY_RETURN
    if (w >= W) return;
#endif

    const int local_C = classes + coords + 1;

    __local const half *restrict src = local_src + w*local_C;
    __local half *restrict dst = local_dst + w;

    const int stride = W;
    logistic_activate_hwc(src, dst, 0, stride);
    logistic_activate_hwc(src, dst, 1, stride);

    //copy plane 2 and 3
    dst[2*stride] = src[2];
    dst[3*stride] = src[3];

    logistic_activate_hwc(src, dst, 4, stride);

    src += coords + 1;
    dst += (coords + 1)*stride;

    if (doSoftmax)
    {
        half max_val = src[0];
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            max_val = max(max_val, src[c]);
        }

        half expSum = 0.0h;
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            const half e = src[c] - max_val;
            const half tmp = exp2(e * log_2_e);
            dst[c*stride] = tmp;
            expSum += tmp;
        }

        const half invExpSum = 1.0h / expSum;
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            dst[c*stride] *= invExpSum;
        }
    }
    else
    {
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            logistic_activate_hwc(src, dst, c, stride);
        }
    }
}

#else // defined (USE_MANUAL_DMA)

#define NUM_CLASSES 80

static void inline logistic_activate(__global const half* restrict src,
                                     __global half* restrict dst,
                                     int offset)
{
    half val = src[offset];
    val = 1.0h / (1.0h + exp2(val * -log_2_e));
    dst[offset] = val;
}

__kernel void region_chw(
    __global const half* restrict global_src,
    __global half* restrict global_dst,
    __local half* restrict _0,
    __local half* restrict _1,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    const int w = get_local_id(0);

#if ALLOW_EARLY_RETURN
    if (w >= W) return;
#endif

    const int local_C = classes + coords + 1;
    const int c = get_group_id(1)*local_C;
    const int h = get_group_id(0);

    __global const half *restrict src = global_src + c*H*W + h*W + w;
    __global half *restrict dst = global_dst + c*H*W + h*W + w;

    const int stride = H*W;
    logistic_activate(src, dst, 0*stride);
    logistic_activate(src, dst, 1*stride);

    //copy plane 2 and 3
    dst[2*stride] = src[2*stride];
    dst[3*stride] = src[3*stride];

    logistic_activate(src, dst, 4*stride);

    src += (coords + 1)*stride;
    dst += (coords + 1)*stride;

    if (doSoftmax)
    {
        __private half data[NUM_CLASSES];

        half max_val = src[0];
        for (int c = 0; c < classes; c++)
        {
            half tmp = src[c*stride];
            data[c] = tmp;
            max_val = max(max_val, tmp);
        }

        half expSum = 0.0h;
        for (int c = 0; c < classes; c++)
        {
            half tmp = half_exp(data[c] - max_val);
            data[c] = tmp;
            expSum += tmp;
        }

        for (int c = 0; c < classes; c++)
        {
            dst[c*stride] = data[c] / expSum;
        }
    }
    else
    {
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            logistic_activate(src, dst, c*stride);
        }
    }
}

static void inline logistic_activate_hwc(__global const half* restrict src,
                                         __global half* restrict dst,
                                         int offset,
                                         int stride)
{
    half val = src[offset];
    val = 1.0h / (1.0h + exp2(val * -log_2_e));
    dst[offset*stride] = val;
}

__kernel void region_hwc(
    __global const half* restrict global_src,
    __global half* restrict global_dst,
    __local half* restrict _0,
    __local half* restrict _1,
    int W,       /* 13 */
    int H,       /* 13 */
    int classes, /* 20 */
    int coords,  /* 4 */
    int num,     /* 5 */
    int maskSize,
    int doSoftmax)
{
    const int w = get_local_id(0);

#if ALLOW_EARLY_RETURN
    if (w >= W) return;
#endif

    const int local_C = classes + coords + 1;
    const int c = get_group_id(1)*local_C;
    const int h = get_group_id(0);
    const int C = num*local_C;

    __global const half *restrict src = global_src + h*W*C + w*C + c;
    __global half *restrict dst = global_dst + c*H*W + h*W + w;

    const int stride = H*W;
    logistic_activate_hwc(src, dst, 0, stride);
    logistic_activate_hwc(src, dst, 1, stride);

    //copy plane 2 and 3
    dst[2*stride] = src[2];
    dst[3*stride] = src[3];

    logistic_activate_hwc(src, dst, 4, stride);

    src += coords + 1;
    dst += (coords + 1)*stride;

    if (doSoftmax)
    {
        __private half data[NUM_CLASSES];

        half max_val = src[0];
        for (int c = 0; c < classes; c++)
        {
            half tmp = src[c];
            data[c] = tmp;
            max_val = max(max_val, tmp);
        }

        half expSum = 0.0h;
        for (int c = 0; c < classes; c++)
        {
            half tmp = half_exp(data[c] - max_val);
            data[c] = tmp;
            expSum += tmp;
        }

        for (int c = 0; c < classes; c++)
        {
            dst[c*stride] = data[c] / expSum;
        }
    }
    else
    {
        #pragma unroll 4
        for (int c = 0; c < classes; c++)
        {
            logistic_activate_hwc(src, dst, c, stride);
        }
    }
}

#endif // defined (USE_MANUAL_DMA)
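All region variants in this commit share the same activation rewrite: the logistic function is expressed through a base-2 exponential, which maps directly onto the exp2 these kernels call. A minimal sketch of the identity (logistic_ref is illustrative only):

    // sigma(x) = 1 / (1 + exp(-x)) = 1 / (1 + 2^(-x * log2(e)))
    half logistic_ref(half x)
    {
        const half log2e = (half)1.442695040888963; // log2(e)
        return 1.0h / (1.0h + exp2(x * -log2e));
    }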
@ -3,75 +3,106 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define NUM_CLASSES 80
__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))

#define nlog_2_e ((half)(-1.442695040888963))
#define ALLOW_EARLY_RETURN 1

static void logistic_activate(__global const half* restrict src_data,
                              __global half* restrict dst_data,
                              int offset)
static void inline logistic_activate(__local const half *restrict src, __local half *restrict dst, int offset)
{
    half val = src_data[offset];
    val = 1.f/(1.f + __builtin_shave_sau_exp2_f16_l_r(val*nlog_2_e));
    dst_data[offset] = val;
    half val = src[offset];
    val = 1.0h / (1.0h + exp2(val * -log_2_e));
    dst[offset] = val;
}

__kernel void region_ocl(__global const half* restrict src_data,
                         __global half* restrict dst_data,
                         int W,
                         int H,
                         int classes,
                         int coords,
                         int num,
                         int maskSize,
                         int doSoftmax)
__kernel void region_chw(
    __global const half *restrict src_data,
    __global half *restrict dst_data,
    int W,
    int H,
    int classes,
    int coords,
    int num,
    int maskSize,
    int doSoftmax)
{
    int box_sz = H * W * (classes + coords + 1);
    int pixel_pos = min((int)get_global_id(0), H*W);
    int box = get_global_id(1);
    __local half local_src[13 * 13 * (4 + 1 + 80)];
    __local half local_dst[13 * 13 * (4 + 1 + 80)];

    //if (pixel_pos >= H*W) return;
    const int box_sz = W * H * (classes + coords + 1);
    event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(1) * box_sz, box_sz, 0);
    wait_group_events(1, &e1);

    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W);
    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W);
    const int pixel_pos = get_local_id(0);
    const int stride = W * H;

    //copy plane 2 and 3
    dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W];
    dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W];
#if ALLOW_EARLY_RETURN
    if (pixel_pos < W * H)
#endif
    {
        __local const half *restrict src = local_src + pixel_pos;
        __local half *restrict dst = local_dst + pixel_pos;

    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W);
        logistic_activate(src, dst, 0 * stride);
        logistic_activate(src, dst, 1 * stride);

    int data_offset = box * box_sz + (coords + 1) * W * H;
        //copy plane 2 and 3
        dst[2 * stride] = src[2 * stride];
        dst[3 * stride] = src[3 * stride];

    __private half data[NUM_CLASSES];
        logistic_activate(src, dst, 4 * stride);

    if (doSoftmax) {
        half max_val = src_data[data_offset + 0*H*W + pixel_pos];
        for (int c = 0; c < classes; c++) {
            half tmp = src_data[data_offset + c*H*W + pixel_pos];
            data[c] = tmp;
            max_val = max( max_val, tmp);
        }
        src += (coords + 1) * stride;
        dst += (coords + 1) * stride;

        half expSum = 0.0f;
        if (doSoftmax) {
            half max_val = src[0];
#pragma unroll 4
            for (int c = 1; c < classes; c++) {
                max_val = max(max_val, src[c * stride]);
            }

        for (int c = 0; c < classes; c++) {
            half tmp = half_exp(data[c] - max_val);
            data[c] = tmp;
            expSum += tmp;
        }
        for (int c = 0; c < classes; c++) {
            data[c] = data[c] / expSum;
        }
            half expSum = 0.0h;
#pragma unroll 4
            for (int c = 0; c < classes; c++) {
                const half e = src[c * stride] - max_val;
                const half tmp = exp2(e * log_2_e);
                dst[c * stride] = tmp;
                expSum += tmp;
            }

        for (int c = 0; c < classes; c++) {
            dst_data[data_offset + c*H*W + pixel_pos + 0] = data[c];
        }
    }
    else {
        for (int i = 0; i < classes; i++) {
            logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + (5 + i)*H*W);
            const half recip = 1.h / expSum;
            int c = 0;
            for (; c < (classes & ~0x3); c += 4) {
                const half t0 = dst[(c + 0) * stride];
                const half t1 = dst[(c + 1) * stride];
                const half t2 = dst[(c + 2) * stride];
                const half t3 = dst[(c + 3) * stride];

                const half e0 = t0 * recip;
                const half e1 = t1 * recip;
                const half e2 = t2 * recip;
                const half e3 = t3 * recip;

                dst[(c + 0) * stride] = e0;
                dst[(c + 1) * stride] = e1;
                dst[(c + 2) * stride] = e2;
                dst[(c + 3) * stride] = e3;
            }
            for (; c < classes; c++) {
                dst[c * stride] *= recip;
            }
        } else {
#pragma unroll 4
            for (int c = 0; c < classes; c++) {
                logistic_activate(src, dst, c * stride);
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst_data + get_group_id(1) * box_sz, local_dst, box_sz, 0);
    wait_group_events(1, &e2);
}
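The softmax in the new branch subtracts the running maximum before exponentiating; this is the standard numerical-stability rewrite, and in fp16 it is what keeps exp2 in range for large class scores:

    softmax(x_c) = exp(x_c - m) / sum_k exp(x_k - m),  where m = max_j x_j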
@ -1,58 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define NUM_CLASSES 80

static void logistic_activate(__global const half* restrict src_data,
                              __global half* restrict dst_data,
                              int offset)
{
    half val = src_data[offset];
    val = 1.0f/(1.0f + native_exp(-val));
    dst_data[offset] = val;
}

__kernel void region_ocl(__global const half* restrict src_data,
                         __global half* restrict dst_data,
                         int W,
                         int H,
                         int classes,
                         int coords)
{
    const int box_sz = H * W * (classes + coords + 1);
    const int pixel_pos = min((int)get_global_id(0), ((H*W) - 1));
    const int box = get_global_id(1);

    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W);
    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W);

    //copy plane 2 and 3
    dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W];
    dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W];

    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W);
    int data_offset = box * box_sz + (coords + 1) * W * H;

    __private half data[NUM_CLASSES];

    half max_val = src_data[data_offset + 0*H*W + pixel_pos];
    for (int c = 0; c < classes; c++) {
        half tmp = src_data[data_offset + c*H*W + pixel_pos];
        data[c] = tmp;
        max_val = max( max_val, tmp);
    }

    half expSum = 0.0f;

    for (int c = 0; c < classes; c++) {
        half tmp = half_exp(data[c] - max_val);
        data[c] = tmp;
        expSum += tmp;
    }
    for (int c = 0; c < classes; c++) {
        dst_data[data_offset + c*H*W + pixel_pos + 0] = data[c] / expSum;
    }
}
@ -1,43 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define NUM_CLASSES 80

static void logistic_activate(__global const half* restrict src_data,
                              __global half* restrict dst_data,
                              int offset)
{
    half val = src_data[offset];
    val = 1.0f/(1.0f + native_exp(-val));
    dst_data[offset] = val;
}

__kernel void region_ocl(__global const half* restrict src_data,
                         __global half* restrict dst_data,
                         int W,
                         int H,
                         int classes,
                         int coords)
{
    int box_sz = H * W * (classes + coords + 1);
    int pixel_pos = min((int)get_global_id(0), ((H*W) - 1));
    int box = get_global_id(1);

    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 0*H*W);
    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 1*H*W);

    //copy plane 2 and 3
    dst_data[box * box_sz + pixel_pos + 2*H*W] = src_data[box * box_sz + pixel_pos + 2*H*W];
    dst_data[box * box_sz + pixel_pos + 3*H*W] = src_data[box * box_sz + pixel_pos + 3*H*W];

    logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + 4*H*W);

    int data_offset = box * box_sz + (coords + 1) * W * H;

    for (int i = 0; i < classes; i++) {
        logistic_activate(src_data, dst_data, box * box_sz + pixel_pos + (5 + i)*H*W);
    }
}
inference-engine/src/vpu/custom_kernels/region_hwc.cl (new file, 114 lines)
@ -0,0 +1,114 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))

#define ALLOW_EARLY_RETURN 1

static void inline logistic_activate_hwc(
    __local const half *restrict src,
    __local half *restrict dst,
    int offset,
    int stride)
{
    half val = src[offset];
    val = 1.0h / (1.0h + exp2(val * -log_2_e));
    dst[offset * stride] = val;
}

__kernel void region_hwc(
    __global const half *restrict src,
    __global half *restrict dst,
    int W,
    int H,
    int classes,
    int coords,
    int num,
    int maskSize,
    int doSoftmax)
{
    __local half local_src[13 * 13 * (4 + 1 + 80)];
    __local half local_dst[13 * 13 * (4 + 1 + 80)];

    const int pixel_pos = get_local_id(0);

    const int local_C = classes + coords + 1;
    const int c = get_group_id(1) * local_C;
    const int h = get_group_id(0);

    num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize;
    const int C = local_C * num;

    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        src + h * W * C + c, // src
        local_C, // num_elements_per_line,
        H * W, // num_lines,
        C - local_C, // src_line_stride,
        0, // dst_line_stride,
        0);

    wait_group_events(1, &e1);

#if ALLOW_EARLY_RETURN
    if (pixel_pos < W * H)
#endif
    {
        const int w = pixel_pos % W;
        const int h = pixel_pos / W;

        __local const half *restrict src = local_src + h * W * local_C + w * local_C;
        __local half *restrict dst = local_dst + h * W + w;

        const int stride = H * W;
        logistic_activate_hwc(src, dst, 0, stride);
        logistic_activate_hwc(src, dst, 1, stride);

        //copy plane 2 and 3
        dst[2 * stride] = src[2];
        dst[3 * stride] = src[3];

        logistic_activate_hwc(src, dst, 4, stride);

        src += coords + 1;
        dst += (coords + 1) * stride;

        if (doSoftmax) {
            half max_val = src[0];
#pragma unroll 4
            for (int c = 1; c < classes; c++) {
                max_val = max(max_val, src[c]);
            }

            half expSum = 0.0h;
#pragma unroll 4
            for (int c = 0; c < classes; c++) {
                const half e = src[c] - max_val;
                const half tmp = exp2(e * log_2_e);
                dst[c * stride] = tmp;
                expSum += tmp;
            }

            const half invExpSum = 1.0h / expSum;
#pragma unroll 4
            for (int c = 0; c < classes; c++) {
                dst[c * stride] *= invExpSum;
            }
        } else {
#pragma unroll 4
            for (int c = 0; c < classes; c++) {
                logistic_activate_hwc(src, dst, c, stride);
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    const int box_sz = W * H * (classes + coords + 1);
    event_t e2 = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0);
    wait_group_events(1, &e2);
}
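The 2D copies used by these kernels all follow one shape: gather a strided region of DDR into a dense __local buffer, compute, then mirror the copy on the way out. Going by the argument comments used throughout this diff (this only restates the calls above, not the full cl_khr_extended_async_copies specification):

    // event = async_work_group_copy_2D2D(
    //     dst,                   // destination base pointer
    //     src,                   // source base pointer
    //     num_elements_per_line, // contiguous elements per line
    //     num_lines,             // how many lines to move
    //     src_line_stride,       // elements skipped after each source line
    //     dst_line_stride,       // elements skipped after each destination line
    //     event);                // 0 starts a fresh transfer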
@ -3,119 +3,65 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define USE_MANUAL_DMA

#if defined (USE_MANUAL_DMA)

__kernel void __dma_preload_reorg_chw(__global half const *restrict src,
                                      __global half *restrict dst,
                                      int W,
                                      int H,
                                      int C,
                                      int stride,
                                      __local half *restrict local_src,
                                      __local half *restrict local_dst)
__kernel void reorg_chw(
    __global const half *restrict src,
    __global half *restrict dst,
    int W,
    int H,
    int C,
    int stride)
{
    const int stride_y = get_group_id(1);
    __local half local_src[8 * 1024];
    __local half local_dst[8 * 1024];

    const int srcIdx = stride_y*W*stride + W*stride*stride*get_group_id(0);

    WorkGroupDmaCreateStrideTransaction(
        src + srcIdx, // src
    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        W * stride * sizeof(half), // src width
        W * stride * sizeof(half), // dst width
        W * stride * stride * get_num_groups(0) * sizeof(half), // src stride
        W * stride * sizeof(half), // dst stride
        W * stride * get_local_size(0) * sizeof(half), //total size
        src + get_group_id(1) * W * stride
            + get_group_id(0) * W * stride * stride, // src
        W * stride, // num_elements_per_line,
        get_local_size(0), // num_lines,
        W * stride * (stride * get_num_groups(0) - 1), // src_line_stride,
        0, // dst_line_stride,
        0);
}
    wait_group_events(1, &e1);

__kernel void __dma_postwrite_reorg_chw(__global half const *restrict src,
                                        __global half *restrict dst,
                                        int W,
                                        int H,
                                        int C,
                                        int stride,
                                        __local half *restrict local_src,
                                        __local half const *restrict local_dst)
{
    const int stride_y = get_group_id(1);

    const int dstIdx = stride_y*W*stride*get_global_size(0) + get_group_id(0)*W;

    WorkGroupDmaCreateStrideTransaction(
        local_dst, // src
        dst + dstIdx, // dst
        W * sizeof(half), // src width
        W * sizeof(half), // dst width
        W * sizeof(half), // src stride
        W * get_num_groups(0) * sizeof(half), // dst stride
        get_local_size(0) * W * stride * sizeof(half), //total size
        0);
}

__kernel void reorg_chw(__global half const *restrict src,
                        __global half *restrict dst,
                        int W,
                        int H,
                        int C,
                        int stride,
                        __local half *restrict local_src,
                        __local half *restrict local_dst)
{
    const int c = get_local_id(0);
    const int stride_x = get_local_id(1);

    const int srcIdx = stride_x + c * W * stride;
    const int dstIdx = stride_x * W * get_local_size(0) + c * W;

    int x = 0;
    for (; x <= W - 8; x += 8) {
        half8 data = (half8){
            local_src[srcIdx + (x + 0) * stride],
            local_src[srcIdx + (x + 1) * stride],
            local_src[srcIdx + (x + 2) * stride],
            local_src[srcIdx + (x + 3) * stride],
            local_src[srcIdx + (x + 4) * stride],
            local_src[srcIdx + (x + 5) * stride],
            local_src[srcIdx + (x + 6) * stride],
            local_src[srcIdx + (x + 7) * stride]};

        *((__local half8 *)(&local_dst[dstIdx + x])) = data;
    }

    for (; x < W; x++) {
        local_dst[dstIdx + x] = local_src[srcIdx + x * stride];
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        dst + get_group_id(0) * W
            + get_group_id(1) * W * stride * get_global_size(0), // dst
        local_dst, // src
        W, // num_elements_per_line
        get_local_size(0) * stride, // num_lines
        0, // src_line_stride
        W * (get_num_groups(0) - 1), // dst_line_stride
        0);
    wait_group_events(1, &e2);
}

#else

__kernel void reorg_chw(__global half const *restrict src,
                        __global half *restrict dst,
                        int W,
                        int H,
                        int C,
                        int stride,
                        __local half const *restrict _0,
                        __local half *restrict _1)
{
    const int stride_x = get_local_id(1);
    const int stride_y = get_group_id(1);
    const int N = get_global_size(0);
    const int c = get_local_id(0)*get_num_groups(0) + get_group_id(0);

    const int srcIdx = c*W*stride*stride + stride_x + stride_y*W*stride;
    const int dstIdx = c*W + stride_x*W*N + stride_y*W*N*stride;

    #pragma unroll 8
    for (int x = 0; x < W; x++) {
        dst[dstIdx + x] = src[srcIdx + x*stride];
    }
}

#endif
@ -1,40 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// kernel with local memory buffer
__kernel void reorg(__global const half* restrict src,
                    __global half* restrict out,
                    __local half* restrict tmp,
                    int H,
                    int W,
                    int stride)
{
    int h = min((int)get_global_id(0), H-1);

    int c = get_global_id(1);
    int C = get_global_size(1);
    int C2 = C/(stride*stride);

    int offset = c / C2;

    int c2 = c - C2 * offset;

    int H2 = H*stride;
    int W2 = W*stride;

    for (int w = 0; w < W; ++w)
    {
        int h2 = h*stride + offset / stride;
        int w2 = w*stride + offset - stride * (offset / stride);

        tmp[get_local_id(1)*get_local_size(0)*W + get_local_id(0)*W + w] = src[W2*H2*c2 + W2*h2 + w2];
    }

    for (int w = 0; w < W; ++w)
    {
        out[W*H*c + W*h + w] = tmp[get_local_id(1)*get_local_size(0)*W + get_local_id(0)*W + w];
    }
}
@ -1,45 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define MAX_W 512

// kernel that uses private memory on stack
__kernel void reorg(__global const half* restrict src,
                    __global half* restrict out,
                    int H,
                    int W,
                    int stride)
{
    int h = min((int)get_global_id(0), H-1);

    int c = get_global_id(1);
    int C = get_global_size(1);
    int C2 = C/(stride*stride);

    int offset = c / C2;

    int c2 = c - C2 * offset;

    int b = get_global_id(2);

    __private half tmp[MAX_W];

    int H2 = H*stride;
    int W2 = W*stride;

    for (int w = 0; w < W; ++w)
    {
        int h2 = h*stride + offset / stride;
        int w2 = w*stride + offset - stride * (offset / stride);

        tmp[w] = src[W2*H2*C2*b + W2*H2*c2 + W2*h2 + w2];
    }

    for (int w = 0; w < W; ++w)
    {
        out[W*H*C*b + W*H*c + W*h + w] = tmp[w];
    }
}
@ -3,66 +3,32 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

__kernel void __dma_preload_reorg_hwc(__global half const *restrict src,
                                      __global half *restrict _0,
                                      int W,
                                      int H,
                                      int C,
                                      int stride,
                                      __local half *restrict local_src,
                                      __local half *restrict _1)
__kernel void reorg_hwc(
    __global half const *restrict src,
    __global half *restrict dst,
    int W,
    int H,
    int C,
    int stride)
{
    const int stride_x = get_group_id(1);
    __local half local_src[8 * 1024];
    __local half local_dst[8 * 1024];

    WorkGroupDmaCreateStrideTransaction(
        src + get_group_id(0) * stride + stride_x * C, // src
    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        stride * sizeof(half), // src_width,
        stride * sizeof(half), // dst_width,
        C * stride * sizeof(half), // src_stride,
        stride * sizeof(half), // dst_stride,
        H * W * sizeof(half), // size
        src + get_group_id(0) * stride + get_group_id(1) * C, // src
        stride, // num_elements_per_line
        H * W / stride, // num_lines
        (C - 1) * stride, // src_line_stride
        0, // dst_line_stride
        0);
}
    wait_group_events(1, &e1);

__kernel void __dma_postwrite_reorg_hwc(__global half const *restrict _0,
                                        __global half *restrict dst,
                                        int W,
                                        int H,
                                        int C,
                                        int stride,
                                        __local half *restrict _1,
                                        __local half *restrict local_dst)
{
    const int stride_x = get_group_id(1);

    WorkGroupDmaCreateStrideTransaction(
        local_dst, // src
        dst + stride_x * C + get_group_id(0) * stride, // dst
        stride * sizeof(half), // src_width,
        stride * sizeof(half), // dst_width,
        stride * sizeof(half), // src_stride,
        C * stride * sizeof(half), // dst_stride,
        W * H * sizeof(half), // size
        0);
}

__kernel void reorg_hwc(__global half const *restrict src,
                        __global half *restrict dst,
                        int W,
                        int H,
                        int C,
                        int stride,
                        __local half *restrict local_src,
                        __local half *restrict local_dst)
{
    const int stride_y = get_local_id(1);
    const int blocks = get_local_size(0);
    const int b = get_local_id(0);

    const int OC = stride * stride;
    const int OH = H / stride;
@ -73,67 +39,27 @@ __kernel void reorg_hwc(__global half const *restrict src,

    for (int block_h = 0; block_h < stride; block_h++) {
        const int src_line = b * stride * stride + stride_y * stride + block_h;
        const int c = src_line / IH;
        const int h = src_line % IH;

        const int dst_line = b * stride + stride_y * blocks * stride + block_h;
        const int oc = dst_line / OH;
        const int oh = dst_line % OH;

        for (int w = 0; w < W / stride; w++) {
            local_dst[oh*OW*OC + w*OC + oc] = local_src[h*IW*IC + w*IC + c];
        }
    }
}

__kernel void reorg_hwc_naive(__global half const *restrict src,
                              __global half *restrict dst,
                              int W,
                              int H,
                              int C,
                              int stride,
                              __local half *restrict local_src,
                              __local half *restrict local_dst)
{
    const int out_c = C / (stride * stride);
    const int oc = C * (stride * stride);
    const int oh = H / stride;
    const int ow = W / stride;

    const int c = get_global_id(0);

    for (int h = 0; h < H; ++h)
    {
        int in_index = W * (h + H*c) + (0);
        int new_z = in_index / (oh*ow);
        int new_y = (in_index %(oh*ow)) / ow;
        int new_x = (in_index %(oh*ow)) % ow;
        int new_index = new_z + new_x * oc + new_y * oc * ow;

        in_index++;

        int c2 = c % out_c;
        int offset = c / out_c;
        int w2 = 0 * stride + offset % stride;
        int h2 = h * stride + offset / stride;
        int out_index = w2 + W * stride * (h2 + H * stride * c2);

        #pragma unroll 2
        for(int i = 0; i < W; ++i, out_index+=stride, in_index++)
        {
            // repacking coordinates
            int k0 = out_index / (H*W);
            int j0 = (out_index % (H*W)) / W;
            int i0 = (out_index % (H*W)) % W;
            int out_index_repack = k0 + C * i0 + C * W * j0;

            dst[new_index] = src[out_index_repack];

            int new_z = in_index / (oh*ow);
            int new_y = (in_index %(oh*ow)) / ow;
            int new_x = (in_index %(oh*ow)) % ow;
            new_index = new_z + new_x * oc + new_y * oc * ow;
            local_dst[oh * OW * OC + w * OC + oc] = local_src[h * IW * IC + w * IC + c];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        dst + get_group_id(1) * C + get_group_id(0) * stride, // dst
        local_dst, // src
        stride, // num_elements_per_line
        W * H / stride, // num_lines
        0, // src_line_stride
        C * stride - stride, // dst_line_stride
        0);
    wait_group_events(1, &e2);
}
inference-engine/src/vpu/custom_kernels/reorg_hwc_naive.cl (new file, 53 lines)
@ -0,0 +1,53 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void reorg_hwc_naive(
    __global half const *restrict src,
    __global half *restrict dst,
    int W,
    int H,
    int C,
    int stride)
{
    const int out_c = C / (stride * stride);
    const int oc = C * (stride * stride);
    const int oh = H / stride;
    const int ow = W / stride;

    const int c = get_global_id(0);

    for (int h = 0; h < H; ++h) {
        int in_index = W * (h + H * c) + (0);
        int new_z = in_index / (oh * ow);
        int new_y = (in_index % (oh * ow)) / ow;
        int new_x = (in_index % (oh * ow)) % ow;
        int new_index = new_z + new_x * oc + new_y * oc * ow;

        in_index++;

        int c2 = c % out_c;
        int offset = c / out_c;
        int w2 = 0 * stride + offset % stride;
        int h2 = h * stride + offset / stride;
        int out_index = w2 + W * stride * (h2 + H * stride * c2);

#pragma unroll 2
        for (int i = 0; i < W; ++i, out_index += stride, in_index++) {
            // repacking coordinates
            int k0 = out_index / (H * W);
            int j0 = (out_index % (H * W)) / W;
            int i0 = (out_index % (H * W)) % W;
            int out_index_repack = k0 + C * i0 + C * W * j0;

            dst[new_index] = src[out_index_repack];

            int new_z = in_index / (oh * ow);
            int new_y = (in_index % (oh * ow)) / ow;
            int new_x = (in_index % (oh * ow)) % ow;
            new_index = new_z + new_x * oc + new_y * oc * ow;
        }
    }
}
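The coordinate repacking above is easier to read as the plain reorg (space-to-depth) mapping used by the deleted reorg kernels earlier in this diff, with C2 = C / (stride*stride):

    // offset = c / C2;  c2 = c % C2;
    // out[c][h][w] = in[c2][h*stride + offset/stride][w*stride + offset%stride]
    // i.e. each output channel picks one phase of a stride x stride pixel block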
inference-engine/src/vpu/custom_kernels/resample_AA.cl (new file, 122 lines)
@ -0,0 +1,122 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define USE_OPTIMIZED_ROUND

#ifdef USE_OPTIMIZED_ROUND
#define ROUND(x) ((int)((x) + 0.5f))
#else
#define ROUND(x) (int)(round(x))
#endif

inline int out_to_in(float ox, float f)
{
#ifdef USE_OPTIMIZED_ROUND
    return (int)((ox + 0.5f) / f);
#else
    return ROUND((ox + 0.5f) / f - 0.5f);
#endif
}

static inline float triangleCoeff(float x) { return 1.0f - fabs(x); }

static inline float4 triangleCoeff4(float4 x) { return 1.0f - fabs(x); }

__kernel void resample_with_antialias(
    __global const half *restrict src,
    __global half *restrict dst,
    int iw,
    int ih,
    float factor,
    int ow,
    int oh,
    int channels)
{
    __local half local_src[20 * 1024];
    __local half local_dst[8 * 1024];

    const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
    const int oy_first = get_group_id(1) * get_local_size(1);
    const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
    const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1);
    const int iy_size = iy_last - iy_first + 1;

    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src
        iy_size * iw, // num_elements_per_line,
        get_local_size(2), // num_lines,
        (ih - iy_size) * iw, // src_line_stride,
        0, // dst_line_stride,
        0);
    wait_group_events(1, &e1);

    const int oy = get_global_id(1);
    const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;
    const int iy = ROUND(iy_f);

    __local half const *restrict start_src =
        local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2);
    __local half *restrict start_dst =
        local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);

    for (int ox = 0; ox < ow; ox++) {
        const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
        const int ix_i = ROUND(ix_f);

        float4 v_sum = 0.f;
        float4 v_wsum = 0.f;
        for (int y = 0; y < iy_size; y++) {
            float dy = iy_f - y;
            int x = max(ix_i - r, 0);
            int end_x = min(ix_i + r, iw - 1);

            float4 dx;
            for (int i = 0; i < 4; i++) dx[i] = ix_f - x - i;

            for (; x < end_x - 3; x += 4, dx -= 4) {
                float4 w =
                    factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy);
                float4 src_vec = {
                    start_src[y * iw + x + 0],
                    start_src[y * iw + x + 1],
                    start_src[y * iw + x + 2],
                    start_src[y * iw + x + 3]};

                v_sum += w * src_vec;
                v_wsum += w;
            }

            for (; x <= end_x; x++) {
                float dx = ix_f - x;
                float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor * dy);

                v_sum[0] += w * start_src[y * iw + x];
                v_wsum[0] += w;
            }
        }

        v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];

        start_dst[get_local_id(1) * ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow
            + get_group_id(1) * get_local_size(1) * ow, // dst
        local_dst, // src
        get_local_size(1) * ow, // num_elements_per_line,
        get_local_size(2), // num_lines,
        0, // src_line_stride,
        (get_global_size(1) - get_local_size(1)) * ow, // dst_line_stride,
        0);
    wait_group_events(1, &e2);
}
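The weights above are a factor-scaled triangle (tent) filter; per output pixel the kernel evaluates, in the notation of the code:

    w(dx, dy)   = factor*tri(factor*dx) * factor*tri(factor*dy),  tri(t) = 1 - |t|
    dst(ox, oy) = sum(w * src) / sum(w)

tri() is not clamped at zero in the code; the loop radius r already bounds its support.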
@ -1,173 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define USE_OPTIMIZED_ROUND

#ifdef USE_OPTIMIZED_ROUND
#define ROUND(x) ((int)((x) + 0.5f))
#else
#define ROUND(x) (int)(round(x))
#endif

inline int out_to_in(float ox, float f) {
    return (int)((ox + 0.5f) * f);
}

#define USE_MANUAL_DMA

#if defined (USE_MANUAL_DMA)

void interpolationCHW_nn(__local half* psrc, __local half* pdst, int OW, int IW, int C, float rw, float rh)
{
    float alpha = rh / 2.0f - 0.5f;

    for (int w = 0; w < OW/8; w++)
    {
        float fw0 = rw*(w*8+0) + alpha;
        float fw1 = rw*(w*8+1) + alpha;
        float fw2 = rw*(w*8+2) + alpha;
        float fw3 = rw*(w*8+3) + alpha;

        float fw4 = rw*(w*8+4) + alpha;
        float fw5 = rw*(w*8+5) + alpha;
        float fw6 = rw*(w*8+6) + alpha;
        float fw7 = rw*(w*8+7) + alpha;

        int iw0 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw0), IW-1);
        int iw1 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw1), IW-1);
        int iw2 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw2), IW-1);
        int iw3 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw3), IW-1);

        int iw4 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw4), IW-1);
        int iw5 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw5), IW-1);
        int iw6 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw6), IW-1);
        int iw7 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw7), IW-1);

        for (int c = 0; c < C; c++)
        {
            half8 val = {
                *((__local half*)(psrc + c * IW + iw0)),
                *((__local half*)(psrc + c * IW + iw1)),

                *((__local half*)(psrc + c * IW + iw2)),
                *((__local half*)(psrc + c * IW + iw3)),

                *((__local half*)(psrc + c * IW + iw4)),
                *((__local half*)(psrc + c * IW + iw5)),

                *((__local half*)(psrc + c * IW + iw6)),
                *((__local half*)(psrc + c * IW + iw7)),
            };
            *((__local half8*)(pdst + c * OW + w*8)) = val;
        }
    }

    for (int w = OW/8*8; w < OW; w++)
    {
        float fw = rw*w + alpha;
        int iw0 = __builtin_shave_cmu_min_i32_rr_int((int)ROUND(fw), IW-1);

        for (int c = 0; c < C; c++)
        {
            *((__local half*)(pdst + c * OW + w)) = *((__local half*)(psrc + c * IW + iw0));
        }
    }
}

__kernel void __dma_preload_resample_nearest(__global const half* restrict src,
                                             __global half* restrict _0,
                                             __local half* restrict local_src,
                                             __local half* restrict _1,
                                             int iw,
                                             int ih,
                                             float factor,
                                             int ow,
                                             int oh,
                                             int channels)
{
    const int oy_first = get_group_id(1) * get_local_size(1);
    const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
    const int iy_first = out_to_in(oy_first, 1.0 / factor);
    const int iy_last = out_to_in(oy_last, 1.0 /factor);
    const int iy_size = iy_last - iy_first + 1;

    WorkGroupDmaCreateStrideTransaction(
        src + get_group_id(2)*channels*ih*iw + iy_first*iw, // src
        local_src, // dst
        iy_size * iw * sizeof(half), // src_width,
        iy_size * iw * sizeof(half), // dst_width,
        ih * iw * sizeof(half), // src_stride,
        iy_size * iw * sizeof(half), // dst_stride,
        channels * iy_size * iw * sizeof(half), // size
        0);
}

__kernel void __dma_postwrite_resample_nearest(__global const half* restrict _0,
                                               __global half* restrict dst,
                                               __local half* restrict _1,
                                               __local half* restrict local_dst,
                                               int iw,
                                               int ih,
                                               float factor,
                                               int ow,
                                               int oh,
                                               int channels)
{
    WorkGroupDmaCreateStrideTransaction(
        local_dst, // src
        dst + get_group_id(2)*channels*get_global_size(1)*ow + get_group_id(1)*get_local_size(1)*ow, // dst
        get_local_size(1) * ow * sizeof(half), // src_width,
        get_local_size(1) * ow * sizeof(half), // dst_width,
        get_local_size(1) * ow * sizeof(half), // src_stride,
        get_global_size(1) * ow * sizeof(half), // dst_stride,
        channels * get_local_size(1) * ow * sizeof(half), // size
        0);
}

kernel void resample_nearest(__global const half* restrict src,
                             __global half* restrict dst,
                             __local half* restrict local_src,
                             __local half* restrict local_dst,
                             int iw,
                             int ih,
                             float factor,
                             int ow,
                             int oh,
                             int channels)
{
    interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor);
}

#else // defined (USE_MANUAL_DMA)

kernel void resample_nearest(__global const half* restrict src,
                             __global half* restrict dst,
                             __local half* restrict local_src,
                             __local half* restrict local_dst,
                             int iw,
                             int ih,
                             float factor,
                             int ow,
                             int oh,
                             int channels)
{
    const float inv_factor = 1.0f / factor;
    const int iy = out_to_in(get_global_id(1), inv_factor);

    __global half* dst_data = dst + get_global_id(1)*ow;
    __global half* src_data = src + iy*iw;

    for (int ox = 0; ox < ow; ++ox)
    {
        const int ix = out_to_in(ox, inv_factor);
        for (int c = 0; c < channels; c++) {
            dst_data[c*oh*ow + ox] = src_data[c*ih*iw + ix];
        }
    }
}

#endif // defined (USE_MANUAL_DMA)
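Both resample_nearest variants use the same half-pixel mapping from output to input coordinates. Since out_to_in is called with f = 1/factor, the source column for an output column ox reduces to:

    int ix = (int)((ox + 0.5f) / factor); // nearest source index via the half-pixel rule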
inference-engine/src/vpu/custom_kernels/resample_noAA.cl (new file, 112 lines)
@@ -0,0 +1,112 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define USE_OPTIMIZED_ROUND

#ifdef USE_OPTIMIZED_ROUND
#define ROUND(x) ((int)((x) + 0.5f))
#else
#define ROUND(x) (int)(round(x))
#endif

inline int out_to_in(float ox, float f) { return (int)((ox + 0.5f) * f); }

void interpolationCHW_nn(__local half *psrc, __local half *pdst, int OW, int IW, int C, float rw, float rh)
{
    float alpha = rh / 2.0f - 0.5f;

    for (int w = 0; w < OW / 8; w++) {
        float fw0 = rw * (w * 8 + 0) + alpha;
        float fw1 = rw * (w * 8 + 1) + alpha;
        float fw2 = rw * (w * 8 + 2) + alpha;
        float fw3 = rw * (w * 8 + 3) + alpha;

        float fw4 = rw * (w * 8 + 4) + alpha;
        float fw5 = rw * (w * 8 + 5) + alpha;
        float fw6 = rw * (w * 8 + 6) + alpha;
        float fw7 = rw * (w * 8 + 7) + alpha;

        int iw0 = min((int)ROUND(fw0), IW - 1);
        int iw1 = min((int)ROUND(fw1), IW - 1);
        int iw2 = min((int)ROUND(fw2), IW - 1);
        int iw3 = min((int)ROUND(fw3), IW - 1);

        int iw4 = min((int)ROUND(fw4), IW - 1);
        int iw5 = min((int)ROUND(fw5), IW - 1);
        int iw6 = min((int)ROUND(fw6), IW - 1);
        int iw7 = min((int)ROUND(fw7), IW - 1);

        for (int c = 0; c < C; c++) {
            half8 val = {
                *((__local half *)(psrc + c * IW + iw0)),
                *((__local half *)(psrc + c * IW + iw1)),
                *((__local half *)(psrc + c * IW + iw2)),
                *((__local half *)(psrc + c * IW + iw3)),

                *((__local half *)(psrc + c * IW + iw4)),
                *((__local half *)(psrc + c * IW + iw5)),
                *((__local half *)(psrc + c * IW + iw6)),
                *((__local half *)(psrc + c * IW + iw7)),
            };
            *((__local half8 *)(pdst + c * OW + w * 8)) = val;
        }
    }

    for (int w = OW / 8 * 8; w < OW; w++) {
        float fw = rw * w + alpha;
        int iw0 = min((int)ROUND(fw), IW - 1);

        for (int c = 0; c < C; c++) {
            *((__local half *)(pdst + c * OW + w)) = *((__local half *)(psrc + c * IW + iw0));
        }
    }
}

kernel void resample_nearest(
    __global const half *restrict src,
    __global half *restrict dst,
    int iw,
    int ih,
    float factor,
    int ow,
    int oh,
    int channels)
{
    __local half local_src[14 * 1024];
    __local half local_dst[14 * 1024];

    const int oy_first = get_group_id(1) * get_local_size(1);
    const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
    const int iy_first = out_to_in(oy_first, 1.0 / factor);
    const int iy_last = out_to_in(oy_last, 1.0 / factor);

    const int iy_size = iy_last - iy_first + 1;

    event_t e1 = async_work_group_copy_2D2D(
        local_src, // dst
        src + get_group_id(2) * channels * ih * iw + iy_first * iw, // src
        iy_size * iw, // num_elements_per_line,
        channels, // num_lines,
        ih * iw - iy_size * iw, // src_line_stride,
        0, // dst_line_stride,
        0);

    wait_group_events(1, &e1);

    interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor);

    event_t e2 = async_work_group_copy_2D2D(
        dst + get_group_id(2) * channels * get_global_size(1) * ow + get_group_id(1) * get_local_size(1) * ow, // dst
        local_dst, // src
        get_local_size(1) * ow, // size_t num_elements_per_line,
        channels, // size_t num_lines,
        0, // size_t src_line_stride,
        get_global_size(1) * ow - get_local_size(1) * ow, // size_t dst_line_stride,
        0);

    wait_group_events(1, &e2);
}
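The async_work_group_copy_2D2D staging above relies on the input rows needed by one workgroup forming a contiguous range. A hedged C++ sketch of that range computation (the local size and factor are example values, not taken from the kernel's actual launch configuration):

#include <cstdio>

static int out_to_in(float ox, float inv_factor) {
    return (int)((ox + 0.5f) * inv_factor);
}

int main() {
    const int   local_rows = 4;     // example get_local_size(1)
    const float factor     = 2.0f;  // example upscale factor
    for (int group = 0; group < 3; ++group) {
        const int oy_first = group * local_rows;
        const int oy_last  = (group + 1) * local_rows - 1;
        const int iy_first = out_to_in((float)oy_first, 1.0f / factor);
        const int iy_last  = out_to_in((float)oy_last,  1.0f / factor);
        // iy_size input rows get DMA'd into local memory for this group.
        std::printf("group %d: output rows [%d, %d] need input rows [%d, %d] (%d rows)\n",
                    group, oy_first, oy_last, iy_first, iy_last, iy_last - iy_first + 1);
    }
    return 0;
}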
@@ -1,245 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define USE_OPTIMIZED_ROUND

#ifdef USE_OPTIMIZED_ROUND
#define ROUND(x) ((int)((x) + 0.5f))
#else
#define ROUND(x) (int)(round(x))
#endif

inline int out_to_in(float ox, float f) {
#ifdef USE_OPTIMIZED_ROUND
    return (int)((ox + 0.5f) / f);
#else
    return ROUND((ox + 0.5f) / f - 0.5f);
#endif
}

static inline float triangleCoeff(float x)
{
    return 1.0f - fabs(x);
}

static inline float4 triangleCoeff4(float4 x)
{
    return 1.0f - fabs(x);
}

static inline half triangleCoeffHalf(half x)
{
    return 1.0h - fabs(x);
}

static inline half4 triangleCoeffHalf4(half4 x)
{
    return 1.0h - fabs(x);
}

static inline half8 triangleCoeffHalf8(half8 x)
{
    return 1.0h - fabs(x);
}

#define USE_MANUAL_DMA

#if defined (USE_MANUAL_DMA)

__kernel void __dma_preload_resample_with_antialias(__global const half* restrict src,
                                                    __global half* restrict _0,
                                                    __local half* restrict local_src,
                                                    __local half* restrict _1,
                                                    int iw,
                                                    int ih,
                                                    float factor,
                                                    int ow,
                                                    int oh,
                                                    int channels)
{
    const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
    const int oy_first = get_group_id(1) * get_local_size(1);
    const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
    const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1);
    const int iy_size = iy_last - iy_first + 1;

    WorkGroupDmaCreateStrideTransaction(
        src + get_group_id(2)*get_local_size(2)*ih*iw + iy_first*iw, // src
        local_src, // dst
        iy_size * iw * sizeof(half), // src_width,
        iy_size * iw * sizeof(half), // dst_width,
        ih * iw * sizeof(half), // src_stride,
        iy_size * iw * sizeof(half), // dst_stride,
        get_local_size(2) * iy_size * iw * sizeof(half), // size
        0);
}

__kernel void __dma_postwrite_resample_with_antialias(__global const half* restrict _0,
                                                      __global half* restrict dst,
                                                      __local half* restrict _1,
                                                      __local half* restrict dst_local,
                                                      int iw,
                                                      int ih,
                                                      float factor,
                                                      int ow,
                                                      int oh,
                                                      int channels)
{
    WorkGroupDmaCreateStrideTransaction(
        dst_local, // src
        dst + get_group_id(2)*get_local_size(2)*get_global_size(1)*ow + get_group_id(1)*get_local_size(1)*ow, // dst
        get_local_size(1) * ow * sizeof(half), // src_width,
        get_local_size(1) * ow * sizeof(half), // dst_width,
        get_local_size(1) * ow * sizeof(half), // src_stride,
        get_global_size(1) * ow * sizeof(half), // dst_stride,
        get_local_size(2) * get_local_size(1) * ow * sizeof(half), // size
        0);
}

__kernel void resample_with_antialias(const __global half* restrict src,
                                      __global half* restrict dst,
                                      __local half* restrict local_src,
                                      __local half* restrict local_dst,
                                      int iw,
                                      int ih,
                                      float factor,
                                      int ow,
                                      int oh,
                                      int channels)
{
    const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
    const int oy_first = get_group_id(1) * get_local_size(1);
    const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
    const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
    const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1);
    const int iy_size = iy_last - iy_first + 1;
    const int oy = get_global_id(1);
    const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;
    const int iy = ROUND(iy_f);

    __local half const *restrict start_src = local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2);
    __local half *restrict start_dst = local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);

    for (int ox = 0; ox < ow; ox++)
    {
        const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
        const int ix_i = ROUND(ix_f);

        float4 v_sum = 0.f;
        float4 v_wsum = 0.f;
        for (int y = 0; y < iy_size; y++)
        {
            float dy = iy_f - y;
            int x = max(ix_i - r, 0);
            int end_x = min(ix_i + r, iw - 1);

            float4 dx;
            for (int i = 0; i < 4; i++)
                dx[i] = ix_f - x - i;

            for (; x < end_x - 3; x += 4, dx -= 4)
            {
                float4 w = factor*triangleCoeff4(factor*dx) * factor*triangleCoeff(factor*dy);
                float4 src_vec = { start_src[y*iw + x + 0],
                                   start_src[y*iw + x + 1],
                                   start_src[y*iw + x + 2],
                                   start_src[y*iw + x + 3] };

                v_sum += w * src_vec;
                v_wsum += w;
            }

            for (; x <= end_x; x++)
            {
                float dx = ix_f - x;
                float w = factor*triangleCoeff(factor*dx) * factor*triangleCoeff(factor*dy);

                v_sum[0] += w * start_src[y*iw + x];
                v_wsum[0] += w;
            }
        }

        v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];

        start_dst[get_local_id(1)*ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
    }
}

#else

__kernel void resample_with_antialias(const __global half* restrict src,
                                      __global half* restrict dst,
                                      __local half* restrict _0,
                                      __local half* restrict _1,
                                      int iw,
                                      int ih,
                                      float factor,
                                      int ow,
                                      int oh,
                                      int channels)
{
    int oy = get_global_id(1);
    int c = get_global_id(2);

    int r = (factor > 1.0f) ? 2 : ceil((1.0f)/factor);

    const __global half* restrict start_src = src + iw * ih * c;
    __global half* restrict start_dst = dst + ow * oh * c;

    float iy_f = (oy + 0.5) / factor - 0.5f;
    int iy_i = ROUND(iy_f);

    for (int ox = 0; ox < ow; ox++)
    {
        float ix_f = (ox + 0.5) / factor - 0.5f;
        int ix_i = ROUND(ix_f);

        float4 v_sum = 0.f;
        float4 v_wsum = 0.f;

        for (int y = max(iy_i - r, 0); y <= min(iy_i + r, (int)ih - 1); y++)
        {
            float dy = iy_f - y;
            int x = max(ix_i - r, 0);
            int end_x = min(ix_i + r, (int)iw - 1);

            float4 dx;
            for (int i = 0; i < 4; i++)
                dx[i] = ix_f - x - i;

            for (; x <= end_x - 3; x += 4, dx -= 4)
            {
                float4 w = factor*triangleCoeff4(factor*dx) * factor*triangleCoeff(factor*dy);
                float4 src_vec = { start_src[y*iw + x + 0],
                                   start_src[y*iw + x + 1],
                                   start_src[y*iw + x + 2],
                                   start_src[y*iw + x + 3] };

                v_sum += w * src_vec;
                v_wsum += w;
            }

            for (; x <= end_x; x++)
            {
                float dx = ix_f - x;
                float w = factor*triangleCoeff(factor*dx) * factor*triangleCoeff(factor*dy);

                v_sum[0] += w * start_src[y*iw + x];
                v_wsum[0] += w;
            }
        }

        v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
        v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];

        start_dst[oy*ow + ox] = (!v_wsum[0]) ? (half)0.0f : (half)(v_sum[0] / v_wsum[0]);
    }
}

#endif
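For reference, the deleted antialias kernel weights each neighbour with a scaled tent (triangle) filter, w = factor * (1 - |factor * d|) per axis, accumulating a weighted sum and a weight sum. A small C++ sketch of that accumulation for one output pixel in one dimension (the factor, window, and input values are illustrative; the kernel's vector masking is simplified to a bounds check):

#include <cmath>
#include <cstdio>

static float triangleCoeff(float x) { return 1.0f - std::fabs(x); }

int main() {
    const float factor = 0.5f;                      // example downscale by 2
    const int   r      = (factor > 1.0f) ? 2 : (int)std::ceil(1.0f / factor);
    const float src[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // toy 1D input row

    const int   ox   = 1;                           // one output pixel
    const float ix_f = (ox + 0.5f) / factor - 0.5f;
    const int   ix_i = (int)(ix_f + 0.5f);          // same ROUND as the kernel

    float sum = 0.f, wsum = 0.f;
    for (int x = ix_i - r; x <= ix_i + r; ++x) {
        if (x < 0 || x > 7) continue;               // stand-in for the kernel's clamping
        const float w = factor * triangleCoeff(factor * (ix_f - x));
        sum  += w * src[x];
        wsum += w;
    }
    std::printf("out[%d] = %f\n", ox, wsum ? sum / wsum : 0.f);
    return 0;
}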
@@ -4,12 +4,13 @@

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

__kernel void ShuffleChannel(__global const half* restrict src_data,
                             __global half* restrict dst_data,
                             int C,
                             int H,
                             int W,
                             int G)
__kernel void ShuffleChannel(
    __global const half *restrict src_data,
    __global half *restrict dst_data,
    int C,
    int H,
    int W,
    int G)
{
    int c = get_global_id(0);
    if (c >= C) return;
@@ -18,16 +19,15 @@ __kernel void ShuffleChannel(__global const half* restrict src_data,
    int cy = c % G;
    int cx = c / G;

    __global const half8* src_line = ((__global const half8*)(src_data + cy*CX*H*W + cx*H*W));
    __global half8* dst_line = ((__global half8*)(dst_data + cx*CY*H*W + cy*H*W));
    __global const half8 *src_line =
        ((__global const half8 *)(src_data + cy * CX * H * W + cx * H * W));
    __global half8 *dst_line = ((__global half8 *)(dst_data + cx * CY * H * W + cy * H * W));

    for (int i = 0; i < W*H/8; i++)
    {
    for (int i = 0; i < W * H / 8; i++) {
        dst_line[i] = src_line[i];
    }

    for (int i = W*H/8*8; i < W*H; i++)
    {
        dst_data[cx*CY*H*W + cy*H*W + i] = src_data[cy*CX*H*W + cx*H*W + i];
    for (int i = W * H / 8 * 8; i < W * H; i++) {
        dst_data[cx * CY * H * W + cy * H * W + i] = src_data[cy * CX * H * W + cx * H * W + i];
    }
}
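The channel permutation the kernel implements is easiest to see spelled out. A hedged C++ sketch of the index math, assuming CX = C / G and CY = G as in the elided hunk (C and G here are example values):

#include <cstdio>

int main() {
    // Assumed from the elided hunk: CX = C / G, CY = G.
    const int C = 6, G = 2, CX = C / G, CY = G;
    for (int c = 0; c < C; ++c) {
        const int cy = c % G, cx = c / G;
        // dst channel cx*CY + cy is copied from src channel cy*CX + cx,
        // matching the src_line/dst_line base offsets in the kernel above.
        std::printf("dst channel %d <- src channel %d\n", cx * CY + cy, cy * CX + cx);
    }
    return 0;
}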
@@ -3,51 +3,29 @@
//

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable

#define MAX_WIDTH 512
#define MIN(a, b) ((a) < (b)) ? (a) : (b);

__kernel void __dma_postwrite_ocl_st(__global half const *const restrict src_data,
                                     __global half const *const restrict theta,
                                     __global half *const restrict dst_data,
                                     int C,
                                     int W,
                                     __local half const *const restrict local_dst)
{
    const int x0 = get_global_id(0) * MAX_WIDTH;
    const int x1 = MIN(x0 + MAX_WIDTH, W);
    const int length = x1 - x0;

    WorkGroupDmaCreate3DTransaction(
        local_dst, // src
        dst_data + get_global_id(1) * W + x0, // dst
        length * sizeof(half), // src width
        length * sizeof(half), // dst width
        length * sizeof(half), // src stride
        W * sizeof(half), // dst stride
        C, // num planes
        get_local_size(1) * length * sizeof(half), // src plane stride
        get_global_size(1) * W * sizeof(half), // dst plane stride
        get_local_size(1) * length * sizeof(half), // plane size
        0);
}

__attribute__((noinline))
void calcInd(__global half const *const restrict theta,
             half *const restrict weight,
             int *const restrict ind,
             int y, int H, int x0, int length, int step, int W)
__attribute__((noinline)) void calcInd(
    __global const half *restrict theta,
    __local half *restrict weight,
    __local int *restrict ind,
    int y,
    int H,
    int x0,
    int length,
    int step,
    int W)
{
    float a = (float)y * 1.0f / H * 2 - 1;

    int x = 0;

    float8 va = (float8) {a, a, a, a, a, a, a, a};
    float8 vxy = (float8) {x0 + 0, x0 + 1, x0 + 2, x0 + 3,
                           x0 + 4, x0 + 5, x0 + 6, x0 + 7};
    float8 va = (float8){a, a, a, a, a, a, a, a};
    float8 vxy = (float8){x0 + 0, x0 + 1, x0 + 2, x0 + 3, x0 + 4, x0 + 5, x0 + 6, x0 + 7};

    for (; x <= length - 8; x += 8, vxy += 8)
    {
    for (; x <= length - 8; x += 8, vxy += 8) {
        float8 va1 = vxy * 1.0f / W * 2 - 1.f;

        float8 vx = (va * theta[0] + va1 * theta[1] + theta[2] + 1.f) / 2.f * H;
@@ -61,21 +39,27 @@ void calcInd(__global half const *const restrict theta,
        float8 bx = 1.f - ax;
        float8 by = 1.f - ay;

        union {int8 d; uint8 i; } check_x;
        union {
            int8 d;
            uint8 i;
        } check_x;

        check_x.d = ix;
        int8 b01 = check_x.i < (uint8)H;

        check_x.d = ix + 1;
        int8 b45 = check_x.i < (uint8)H;

        union {int8 d; uint8 i; } check_y;
        union {
            int8 d;
            uint8 i;
        } check_y;

        check_y.d = iy;
        int8 b23 = check_y.i < (uint8)W;

        check_y.d = iy + 1;
        int8 b67 = check_y.i < (uint8)W;

        int8 b0123 = b01 & b23;
        int8 b0167 = b01 & b67;
@@ -87,33 +71,48 @@ void calcInd(__global half const *const restrict theta,
        int8 TR_id = ((ix + 0) * W + (iy + 1)) * (b0167 & 1);
        int8 BR_id = ((ix + 1) * W + (iy + 1)) * (b4567 & 1);

        union {float8 f; int8 i;} w0; w0.f = bx * by;
        union {float8 f; int8 i;} w1; w1.f = ax * by;
        union {float8 f; int8 i;} w2; w2.f = bx * ay;
        union {float8 f; int8 i;} w3; w3.f = ax * ay;
        union {
            float8 f;
            int8 i;
        } w0;
        w0.f = bx * by;
        union {
            float8 f;
            int8 i;
        } w1;
        w1.f = ax * by;
        union {
            float8 f;
            int8 i;
        } w2;
        w2.f = bx * ay;
        union {
            float8 f;
            int8 i;
        } w3;
        w3.f = ax * ay;

        w0.i = w0.i & b0123;
        w1.i = w1.i & b4523;
        w2.i = w2.i & b0167;
        w3.i = w3.i & b4567;

        *((half8*)(weight + x + 0*step)) = convert_half8(w0.f);
        *((half8*)(weight + x + 1*step)) = convert_half8(w1.f);
        *((half8*)(weight + x + 2*step)) = convert_half8(w2.f);
        *((half8*)(weight + x + 3*step)) = convert_half8(w3.f);
        *((__local half8 *)(weight + x + 0 * step)) = convert_half8(w0.f);
        *((__local half8 *)(weight + x + 1 * step)) = convert_half8(w1.f);
        *((__local half8 *)(weight + x + 2 * step)) = convert_half8(w2.f);
        *((__local half8 *)(weight + x + 3 * step)) = convert_half8(w3.f);

        *((int8*)(ind + x + 0*step)) = TL_id;
        *((int8*)(ind + x + 1*step)) = BL_id;
        *((int8*)(ind + x + 2*step)) = TR_id;
        *((int8*)(ind + x + 3*step)) = BR_id;
        *((__local int8 *)(ind + x + 0 * step)) = TL_id;
        *((__local int8 *)(ind + x + 1 * step)) = BL_id;
        *((__local int8 *)(ind + x + 2 * step)) = TR_id;
        *((__local int8 *)(ind + x + 3 * step)) = BR_id;
    }

    for (; x < length; x++)
    {
    for (; x < length; x++) {
        float a1 = (float)(x0 + x) * 1.0f / W * 2 - 1;

        float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1)/2 * H;
        float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1)/2 * W;
        float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1) / 2 * H;
        float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1) / 2 * W;

        const int ix = (int)(fx) - (fx < 0);
        const int iy = (int)(fy) - (fy < 0);
@@ -123,15 +122,15 @@ void calcInd(__global half const *const restrict theta,
        float bx = 1 - ax;
        float by = 1 - ay;

        int b0 = ix >= 0;
        int b4 = ix >= -1;
        int b1 = ix < H;
        int b5 = ix < H-1;
        int b5 = ix < H - 1;

        int b2 = iy >= 0;
        int b6 = iy >= -1;
        int b3 = iy < W;
        int b7 = iy < W-1;
        int b7 = iy < W - 1;

        int b01 = b0 & b1;
        int b23 = b2 & b3;
@@ -148,69 +147,79 @@ void calcInd(__global half const *const restrict theta,
        int TR_id = ((ix + 0) * W + (iy + 1)) * b0167;
        int BR_id = ((ix + 1) * W + (iy + 1)) * b4567;

        half w0 = bx*by*b0123;
        half w1 = ax*by*b4523;
        half w2 = bx*ay*b0167;
        half w3 = ax*ay*b4567;
        half w0 = bx * by * b0123;
        half w1 = ax * by * b4523;
        half w2 = bx * ay * b0167;
        half w3 = ax * ay * b4567;

        weight[x + 0*step] = w0;
        weight[x + 1*step] = w1;
        weight[x + 2*step] = w2;
        weight[x + 3*step] = w3;
        weight[x + 0 * step] = w0;
        weight[x + 1 * step] = w1;
        weight[x + 2 * step] = w2;
        weight[x + 3 * step] = w3;

        ind[x + 0*step] = TL_id;
        ind[x + 1*step] = BL_id;
        ind[x + 2*step] = TR_id;
        ind[x + 3*step] = BR_id;
        ind[x + 0 * step] = TL_id;
        ind[x + 1 * step] = BL_id;
        ind[x + 2 * step] = TR_id;
        ind[x + 3 * step] = BR_id;
    }
}

__attribute__((noinline))
void apply(__global half const *const restrict src,
           half const *const restrict weight,
           int const *const restrict ind,
           __local half *const restrict dst,
           int length,
           int step)
__attribute__((noinline)) void apply(
    __global half const *restrict src,
    __local half const *restrict weight,
    __local int const *restrict ind,
    __local half *restrict dst,
    int src_stride,
    int step)
{
    int x = 0;
    for(; x <= length - 8; x += 8)
    {
        int8 TL_id = *((int8*)(ind + x + 0*step));
        int8 BL_id = *((int8*)(ind + x + 1*step));
        int8 TR_id = *((int8*)(ind + x + 2*step));
        int8 BR_id = *((int8*)(ind + x + 3*step));
    for (; x <= src_stride - 8; x += 8) {
        int8 TL_id = *((__local int8 *)(ind + x + 0 * step));
        int8 BL_id = *((__local int8 *)(ind + x + 1 * step));
        int8 TR_id = *((__local int8 *)(ind + x + 2 * step));
        int8 BR_id = *((__local int8 *)(ind + x + 3 * step));

        half8 w00 = *((half8*)(weight + x + 0*step));
        half8 w01 = *((half8*)(weight + x + 1*step));
        half8 w02 = *((half8*)(weight + x + 2*step));
        half8 w03 = *((half8*)(weight + x + 3*step));
        half8 w00 = *((__local half8 *)(weight + x + 0 * step));
        half8 w01 = *((__local half8 *)(weight + x + 1 * step));
        half8 w02 = *((__local half8 *)(weight + x + 2 * step));
        half8 w03 = *((__local half8 *)(weight + x + 3 * step));

        half8 TL = (half8){src[TL_id[0]], src[TL_id[1]], src[TL_id[2]], src[TL_id[3]],
                           src[TL_id[4]], src[TL_id[5]], src[TL_id[6]], src[TL_id[7]]};
        half8 TR = (half8){src[TR_id[0]], src[TR_id[1]], src[TR_id[2]], src[TR_id[3]],
                           src[TR_id[4]], src[TR_id[5]], src[TR_id[6]], src[TR_id[7]]};
        half8 BL = (half8){src[BL_id[0]], src[BL_id[1]], src[BL_id[2]], src[BL_id[3]],
                           src[BL_id[4]], src[BL_id[5]], src[BL_id[6]], src[BL_id[7]]};
        half8 BR = (half8){src[BR_id[0]], src[BR_id[1]], src[BR_id[2]], src[BR_id[3]],
                           src[BR_id[4]], src[BR_id[5]], src[BR_id[6]], src[BR_id[7]]};
        half8 TL = (half8){
            src[TL_id[0]], src[TL_id[1]],
            src[TL_id[2]], src[TL_id[3]],
            src[TL_id[4]], src[TL_id[5]],
            src[TL_id[6]], src[TL_id[7]]};
        half8 TR = (half8){
            src[TR_id[0]], src[TR_id[1]],
            src[TR_id[2]], src[TR_id[3]],
            src[TR_id[4]], src[TR_id[5]],
            src[TR_id[6]], src[TR_id[7]]};
        half8 BL = (half8){
            src[BL_id[0]], src[BL_id[1]],
            src[BL_id[2]], src[BL_id[3]],
            src[BL_id[4]], src[BL_id[5]],
            src[BL_id[6]], src[BL_id[7]]};
        half8 BR = (half8){
            src[BR_id[0]], src[BR_id[1]],
            src[BR_id[2]], src[BR_id[3]],
            src[BR_id[4]], src[BR_id[5]],
            src[BR_id[6]], src[BR_id[7]]};

        half8 res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;

        *((__local half8*)(dst + x)) = res;
        *((__local half8 *)(dst + x)) = res;
    }

    for (; x < length; x++)
    {
        int TL_id = ind[x + 0*step];
        int BL_id = ind[x + 1*step];
        int TR_id = ind[x + 2*step];
        int BR_id = ind[x + 3*step];
    for (; x < src_stride; x++) {
        int TL_id = ind[x + 0 * step];
        int BL_id = ind[x + 1 * step];
        int TR_id = ind[x + 2 * step];
        int BR_id = ind[x + 3 * step];

        half w00 = weight[x + 0*step];
        half w01 = weight[x + 1*step];
        half w02 = weight[x + 2*step];
        half w03 = weight[x + 3*step];
        half w00 = weight[x + 0 * step];
        half w01 = weight[x + 1 * step];
        half w02 = weight[x + 2 * step];
        half w03 = weight[x + 3 * step];

        half TL = src[TL_id];
        half TR = src[TR_id];
@@ -218,36 +227,52 @@ void apply(__global half const *const restrict src,
        half BR = src[BR_id];

        half res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;

        dst[x] = res;
    }
}

__kernel void ocl_st(__global half const *const restrict src_data,
                     __global half const *const restrict theta,
                     __global half const *const restrict dst_data,
                     int C,
                     int W,
                     __local half *const restrict local_dst)
__kernel void ocl_st(
    __global half const *const restrict src_data,
    __global half const *const restrict theta,
    __global half *const restrict dst_data,
    int C,
    int W)
{
    __local int ind[4 * MAX_WIDTH] __attribute__((aligned(16)));
    __local half weight[4 * MAX_WIDTH] __attribute__((aligned(16)));
    __local half local_dst[4 * 1024];

    int w = get_group_id(0);

    int y = get_global_id(1);
    int H = get_global_size(1);

    __private int ind[4][MAX_WIDTH] __attribute__((aligned(16)));
    __private half weight[4][MAX_WIDTH] __attribute__((aligned(16)));
    const int x0 = w * MAX_WIDTH;
    const int x1 = min(x0 + MAX_WIDTH, W);
    const int src_stride = x1 - x0;

    const int x0 = w * MAX_WIDTH;
    const int x1 = MIN(x0 + MAX_WIDTH, W);
    const int length = x1 - x0;
    calcInd(theta, weight, ind, y, H, x0, src_stride, MAX_WIDTH, W);

    calcInd(theta, weight, ind, y, H, x0, length, MAX_WIDTH, W);
    for (int c = 0; c < C; c++) {
        __global half const *restrict src = src_data + c * H * W;
        __local half *restrict dst = local_dst + c * get_local_size(1) * src_stride + get_local_id(1) * src_stride;

    for (int c = 0; c < C; c++)
    {
        __global half const *const restrict src = src_data + c*H*W;
        __local half *const restrict dst = local_dst + c*get_local_size(1)*length + get_local_id(1)*length;

        apply(src, weight, ind, dst, length, MAX_WIDTH);
        apply(src, weight, ind, dst, src_stride, MAX_WIDTH);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e = async_work_group_copy_3D3D(
        dst_data + get_group_id(1) * get_local_size(1) * W + x0, // dst
        local_dst, // src
        src_stride, // num_elements_per_line
        get_local_size(1), // num_lines
        0, // src_line_stride
        W - src_stride, // dst_line_stride
        C, // num planes
        0, // src plane stride
        W * (get_global_size(1) - get_local_size(1)), // dst plane stride
        0);
    wait_group_events(1, &e);
}
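The calcInd/apply split above precomputes, per output pixel, four gather indices and four bilinear weights, so the per-channel hot loop is a pure gather-and-multiply-add. A scalar C++ sketch of the same two-phase scheme on a toy image (sizes and the sampling point are illustrative, and the kernel's vector boundary masking is simplified to a bounds check):

#include <cstdio>

int main() {
    const int H = 4, W = 4;
    float src[H * W];
    for (int i = 0; i < H * W; ++i) src[i] = (float)i;

    // Phase 1 (calcInd): indices and weights for one sampling point (fx, fy).
    const float fx = 1.3f, fy = 2.6f;
    const int   ix = (int)fx, iy = (int)fy;
    const float ax = fx - ix, ay = fy - iy, bx = 1.f - ax, by = 1.f - ay;
    const int   id[4] = {ix * W + iy, (ix + 1) * W + iy, ix * W + iy + 1, (ix + 1) * W + iy + 1};
    const float w[4]  = {bx * by, ax * by, bx * ay, ax * ay};  // TL, BL, TR, BR

    // Phase 2 (apply): gather and weighted sum.
    float res = 0.f;
    for (int k = 0; k < 4; ++k) {
        if (id[k] >= 0 && id[k] < H * W) res += w[k] * src[id[k]];
    }
    std::printf("sample(%.1f, %.1f) = %f\n", fx, fy, res);  // 7.8 for this ramp image
    return 0;
}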
@@ -0,0 +1,188 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifndef SHAVE_METADATA_H_INCLUDED
#define SHAVE_METADATA_H_INCLUDED

#include <cstdint>

enum {
    md_invalid_index = ~0u,
};

enum md_version_t {
    md_version_1_0 = 0x00010000,  // version 1.0
    md_version_1_1 = 0x00010001,  // version 1.1
    md_version_1_2 = 0x00010002,  // version 1.2
    md_version_latest = md_version_1_2
};

struct md_header_t {
    uint32_t version;          // 0xFFFF0000 = Major  0x0000FFFF = Minor

    // md_kernel_descriptor_t array info
    uint32_t kernel_count;     // number of kernels in the .metadata
    uint32_t kernel_first;     // absolute byte offset to first
                               // md_kernel_descriptor_t from start of .metadata

    // md_kernel_argument_t array info
    uint32_t arg_count;        // number of arguments in the .metadata
    uint32_t arg_first;        // absolute byte offset to first
                               // md_kernel_argument_t from start of .metadata

    // md_kernel_sipp_info_t array info
    uint32_t sipp_info_count;  // number of sipp dma infos in .metadata
    uint32_t sipp_info_first;  // absolute byte offset to first
                               // md_kernel_sipp_info_t from start of .metadata

    // md_expr_t array info
    uint32_t expr_count;       // number of expressions in .metadata
    uint32_t expr_first;       // absolute byte offset to first
                               // kernel_expr_t from start of .metadata

    // md_expr_node_t array info
    uint32_t expr_node_count;  // number of expression nodes in .metadata
    uint32_t expr_node_first;  // absolute byte offset to first md_expr_node_t
                               // from start of .metadata

    // function table
    uint32_t func_count;       // number of functions in the function table
    uint32_t func_first;       // absolute byte offset to the first md_function_t
};

struct md_function_t {
    uint32_t load_address;  // runtime address of a kernel function
};

struct md_kernel_variant_t {
    uint32_t name;    // offset into the string table of the kernel name
    uint32_t factor;  // vector width / unroll factor
    uint32_t func;    // index into the kernel function table
};

enum md_kernel_variant_type_t {
    md_variant_scalar = 0,           // basic scalar kernel
    md_variant_vectorized,           // kernel has been vectorized
    md_variant_unrolled,             // kernel has been loop unrolled
    md_variant_sipp_dma,             // sipp dma kernel
    md_variant_sipp_dma_vectorized,  // vectorized sipp dma kernel
    md_variant_dma_preload,          // kernel preload function
    md_variant_dma_postwrite,        // kernel postwrite function
    md_variant_dma_fallback,         // kernel fallback function
    md_VARIANT_COUNT
};

constexpr int kVariantCount = md_VARIANT_COUNT;

enum md_kernel_flags_t {
    md_kernel_flags_ddr_write = 1u,          // kernel writes to DDR memory
    md_kernel_flags_ddr_read = 2u,           // kernel reads from DDR memory
    md_kernel_flags_generated_prepost = 4u,  // kernel has an autogenerated prepost
};

struct md_kernel_descriptor_t {
    uint32_t flags;               // combination of md_kernel_flags_t

    uint32_t arg_count;           // number of arguments for this kernel
    uint32_t arg_index;           // index of first kernel_argument_t

    uint32_t sipp_dma_in_count;   // number of SIPP dma input arguments (or 0 if no SIPP dma)
    uint32_t sipp_dma_out_count;  // number of SIPP dma output arguments (or 0 if no SIPP dma)
    uint32_t sipp_info_index;     // index into the kernel_sipp_info_t list

    uint32_t name;                // metadata string table offset for kernel name

    uint32_t stack_size_wg;       // estimate of stack usage per work group (fixed)
    uint32_t stack_size_wi;       // estimate of stack usage per work item

    // kernel variant list
    md_kernel_variant_t variant[kVariantCount];
};

enum md_arg_addr_space_t {
    md_addr_space_private = 0,
    md_addr_space_global,    // global address space (ddr)
    md_addr_space_constant,
    md_addr_space_local,     // local address space (cmx)

    md_addr_space_undef,     // none of the others
};

enum md_arg_flags_t {
    md_arg_flags_dma_input = 1u,           // local argument is being read from
    md_arg_flags_dma_output = 2u,          // local argument is being written to
    md_arg_flags_dma_double_buffer = 4u,   // local argument should be double buffered
    md_arg_flags_generated_prepost = 8u,   // preload and post write are auto generated
};

struct md_kernel_argument_t {
    uint32_t flags;                  // bitfield of md_arg_flags_t
    uint32_t name;                   // argument name
    uint32_t array_size_expr;        // index to a `kernel_expr_t` type for evaluating total number of elements
    uint32_t size_elm;               // size in bytes of the underlying element
    md_arg_addr_space_t addr_space;  // the argument's address space
    uint32_t alignment;              // alignment required in bytes
    uint32_t arg_pack_offset;        // offset into the argument pack
};

struct md_kernel_sipp_info_t {
    uint32_t num_dims;  // number of dimensions of the dma
    uint32_t span_x;
    uint32_t span_y;

    // below are all indexes to a 'kernel_expr_t'
    uint32_t elm_size;  // size in bytes of the element
    uint32_t stride_y;  // stride in elm_size in y axis
    uint32_t stride_z;  //                       z
    uint32_t base;      // address of the base of the buffer
    uint32_t size_x;    // size in elements for x dim
    uint32_t size_y;    //                      y
    uint32_t size_z;    //                      z
    uint32_t max_x;     // max work item index in x dim
    uint32_t max_y;     //                        y
    uint32_t max_z;     //                        z
};

enum md_expr_node_type_t {
    md_type_global_size = 0,  // global work size
    md_type_local_size,       // local work size
    md_type_param,            // kernel parameter
    md_type_immediate,        // uint32_t immediate value

    md_type_op_umul,          // unsigned multiply
    md_type_op_udiv,          // unsigned divide

    md_type_op_add,           // add
    md_type_op_sub,           // subtract

    md_type_op_min,           // signed min
    md_type_op_max,           // signed max
    md_type_op_umin,          // unsigned min
    md_type_op_umax,          // unsigned max

    md_type_op_and,           // bitwise and
    md_type_op_or,            // bitwise or
    md_type_op_xor,           // bitwise xor

    md_type_op_shl,           // left shift
    md_type_op_lshr,          // right shift

    // more operators as needed
    // ...
};

struct md_expr_node_t {
    md_expr_node_type_t type;  // type of this expression node
    uint32_t value;            // immediate or operand
};

struct md_expr_t {
    uint32_t node_count;  // number of md_expr_node_t's that make up this
                          // expression
    uint32_t node_first;  // index of the first md_expr_node_t that
                          // is part of this expression
};

#endif  // SHAVE_METADATA_H_INCLUDED
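The md_expr_t / md_expr_node_t pair above describes small postfix (RPN) programs over the node array. As a hedged illustration, an argument size such as "global work size in dim 0, times kernel parameter 1" could be stored as three consecutive nodes (the concrete values here are invented for the example):

// Hypothetical node stream for global_size[0] * param[1]:
const md_expr_node_t nodes[] = {
    {md_type_global_size, 0},  // push global_size[0]
    {md_type_param,       1},  // push param[1]
    {md_type_op_umul,     0},  // pop two values, push their product
};
const md_expr_t expr = {3 /* node_count */, 0 /* node_first */};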
@@ -0,0 +1,225 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifndef SHAVE_METADATA_PARSER_H_INCLUDED
#define SHAVE_METADATA_PARSER_H_INCLUDED

#include <string>
#include <vector>
#include <cassert>
#include <cstring>

#include "ShaveElfMetadata.h"

struct md_parser_t {
    md_parser_t(const uint8_t *data, size_t data_size,
                const char *strtab,
                size_t strtab_size)
        : hdr(reinterpret_cast<const md_header_t *>(data)),
          kernel_descriptor(reinterpret_cast<const md_kernel_descriptor_t *>(
              data + hdr->kernel_first)),
          kernel_argument(reinterpret_cast<const md_kernel_argument_t *>(
              data + hdr->arg_first)),
          kernel_sipp_info(reinterpret_cast<const md_kernel_sipp_info_t *>(
              data + hdr->sipp_info_first)),
          expr_node(reinterpret_cast<const md_expr_node_t *>(
              data + hdr->expr_node_first)),
          expr(reinterpret_cast<const md_expr_t *>(data + hdr->expr_first)),
          func(reinterpret_cast<const md_function_t *>(data + hdr->func_first)),
          strtab(strtab), strtab_size(strtab_size) {
        (void)data_size;
        (void)strtab_size;
        assert(hdr->version == md_version_latest);
    }

    // Return the metadata version
    //
    md_version_t get_version() const {
        return static_cast<md_version_t>(hdr->version);
    }

    // Get a kernel by name
    //
    const md_kernel_descriptor_t *get_kernel(const std::string &name) const {
        for (uint32_t i = 0; i < hdr->kernel_count; ++i) {
            const md_kernel_descriptor_t *d = get_kernel(i);
            const char *n = get_name(d);
            if (name == n) {
                return d;
            }
        }
        return nullptr;
    }

    // Get a kernel id by name
    //
    int get_kernel_id(const std::string& name) const {
        for (uint32_t i = 0; i < hdr->kernel_count; ++i) {
            const md_kernel_descriptor_t* d = get_kernel(i);
            const char* n = get_name(d);
            if (name == n) {
                return i;
            }
        }
        return -1;
    }

    // Return true if a kernel has a specific variant
    //
    bool kernel_has_variant(const md_kernel_descriptor_t *kernel,
                            md_kernel_variant_type_t variant) const {
        const auto &v = kernel->variant[variant];
        return v.name != md_invalid_index &&
               v.func != md_invalid_index;
    }

    // Return the load address of a kernel variant
    //
    uint32_t get_kernel_load_addr(const md_kernel_descriptor_t *kernel, const md_kernel_variant_type_t variant) {
        if (!kernel_has_variant(kernel, variant)) {
            return 0;
        }
        const auto &v = kernel->variant[variant];
        const md_function_t &f = func[v.func];
        return f.load_address;
    }

    // Get a rough stack size estimate for a kernel variant
    //
    uint32_t get_kernel_stack_estimate(const md_kernel_descriptor_t *kernel,
                                       md_kernel_variant_type_t variant,
                                       const uint32_t local_size[3]) const {
        const uint32_t local_area = local_size[0] * local_size[1] * local_size[2];
        const uint32_t per_wi = local_area * kernel->stack_size_wi;
        const uint32_t per_wg = kernel->stack_size_wg;
        const uint32_t factor = kernel->variant[variant].factor;
        switch (variant) {
        case md_variant_vectorized:
        case md_variant_unrolled: return per_wg + per_wi * factor;
        case md_variant_scalar:
        default: return per_wg + per_wi;
        }
    }

    // Return the number of local arguments a kernel has
    //
    uint32_t get_num_local_args(const md_kernel_descriptor_t *kernel) const {
        uint32_t out = 0;
        for (uint32_t i = 0; i < kernel->arg_count; ++i) {
            const md_kernel_argument_t *arg = get_argument(kernel->arg_index + i);
            out += arg->addr_space == md_addr_space_local;
        }
        return out;
    }

    // Get the number of distinct kernels in this file
    //
    uint32_t get_kernel_count() const {
        return hdr->kernel_count;
    }

    // Get a function by index
    //
    const md_function_t *get_func_ptr(uint32_t index) const {
        assert(index != md_invalid_index && index < hdr->func_count);
        return func + index;
    }

    // Get a kernel by load address
    //
    const md_kernel_descriptor_t *get_kernel_by_addr(uint32_t addr) const {
        for (uint32_t i = 0; i < hdr->kernel_count; ++i) {
            const md_kernel_descriptor_t *desc = get_kernel(i);
            for (uint32_t j = 0; j < md_VARIANT_COUNT; ++j) {
                const uint32_t index = desc->variant[j].func;
                if (index == md_invalid_index) {
                    continue;
                }
                const md_function_t *ptr = get_func_ptr(index);
                if (ptr->load_address == addr) {
                    return desc;
                }
            }
        }
        return nullptr;
    }

    // Get a kernel by index
    //
    const md_kernel_descriptor_t *get_kernel(uint32_t index) const {
        assert(index < hdr->kernel_count);
        return kernel_descriptor + index;
    }

    // Get an argument by index
    //
    const md_kernel_argument_t *get_argument(uint32_t index) const {
        assert(index < hdr->arg_count);
        return kernel_argument + index;
    }

    // Get SIPP info by index
    //
    const md_kernel_sipp_info_t *get_sipp_info(uint32_t index) const {
        assert(index < hdr->sipp_info_count);
        return kernel_sipp_info + index;
    }

    // Get an expression node by index
    //
    const md_expr_node_t *get_expr_node(uint32_t index) const {
        assert(index < hdr->expr_node_count);
        return expr_node + index;
    }

    // Get an expression by index
    //
    const md_expr_t *get_expr(uint32_t index) const {
        assert(index < hdr->expr_count);
        return expr + index;
    }

    // Get a kernel argument for a specific kernel by position
    //
    const md_kernel_argument_t *get_argument(const md_kernel_descriptor_t *kernel, uint32_t index) const {
        assert(index < kernel->arg_count);
        return get_argument(kernel->arg_index + index);
    }

    // Return the name of a kernel
    //
    const char *get_name(const md_kernel_descriptor_t *kernel) const {
        return strtab + kernel->name;
    }

    // Return the name of an argument
    //
    const char *get_name(const md_kernel_argument_t *arg) const {
        return strtab + arg->name;
    }

    // Evaluate an arbitrary expression
    //
    uint32_t evaluate_expr(const md_expr_t *expression,
                           const uint32_t local_size[3],
                           const uint32_t global_size[3],
                           const uint32_t *param,
                           uint32_t param_count) const;

protected:
    // structure parsers
    const md_header_t *hdr;
    const md_kernel_descriptor_t *kernel_descriptor;
    const md_kernel_argument_t *kernel_argument;
    const md_kernel_sipp_info_t *kernel_sipp_info;
    const md_expr_node_t *expr_node;
    const md_expr_t *expr;
    const md_function_t *func;
    // string table
    const char *strtab;
    const size_t strtab_size;
};

#endif  // SHAVE_METADATA_PARSER_H_INCLUDED
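A hedged sketch of how this parser is meant to be driven, assuming the raw bytes of the .neo_metadata and .neo_metadata.str ELF sections have already been extracted (the section loading itself is out of scope here; all calls below exist in the header above):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include "ShaveElfMetadataParser.h"  // the header added in this commit

void dump_kernels(const uint8_t *md, size_t md_size,
                  const char *strtab, size_t strtab_size) {
    md_parser_t parser(md, md_size, strtab, strtab_size);
    for (uint32_t i = 0; i < parser.get_kernel_count(); ++i) {
        const md_kernel_descriptor_t *k = parser.get_kernel(i);
        std::printf("kernel %s: %u args (%u local)\n",
                    parser.get_name(k), k->arg_count, parser.get_num_local_args(k));
    }
}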
@@ -0,0 +1,93 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "vpu/frontend/ShaveElfMetadataParser.h"
#include <algorithm>

namespace {

// two operand operator evaluation
uint32_t md_eval_expression_type_op_2(
    const md_expr_node_type_t type,
    const uint32_t lhs,
    const uint32_t rhs) {
    switch (type) {
    case md_type_op_umul: return lhs * rhs;
    case md_type_op_udiv: return lhs / rhs;
    case md_type_op_add:  return (int32_t)lhs + (int32_t)rhs;
    case md_type_op_sub:  return (int32_t)lhs - (int32_t)rhs;
    case md_type_op_min:  return std::min((int32_t)lhs, (int32_t)rhs);
    case md_type_op_max:  return std::max((int32_t)lhs, (int32_t)rhs);
    case md_type_op_umin: return std::min(lhs, rhs);
    case md_type_op_umax: return std::max(lhs, rhs);
    case md_type_op_and:  return lhs & rhs;
    case md_type_op_or:   return lhs | rhs;
    case md_type_op_xor:  return lhs ^ rhs;
    case md_type_op_shl:  return lhs << rhs;
    case md_type_op_lshr: return lhs >> rhs;
    default:
        assert(!"unknown node type");
        return 0;
    }
}
} // namespace

uint32_t md_parser_t::evaluate_expr(const md_expr_t *expression,
                                    const uint32_t local_size[3],
                                    const uint32_t global_size[3],
                                    const uint32_t *param,
                                    uint32_t param_count) const {
    // find the nodes for the given expr_index
    assert(expression->node_first < hdr->expr_node_count);
    const md_expr_node_t *node = expr_node + expression->node_first;
    // the intermediate value stack
    std::vector<uint32_t> values;
    // for all of the nodes in this expression
    for (uint32_t i = 0; i < expression->node_count; ++i) {
        // get the node
        const md_expr_node_t &v = node[i];
        // dispatch the opcode
        switch (v.type) {
        case md_type_immediate:
            values.push_back(v.value);
            break;
        case md_type_op_umul:
        case md_type_op_udiv:
        case md_type_op_add:
        case md_type_op_sub:
        case md_type_op_min:
        case md_type_op_max:
        case md_type_op_umin:
        case md_type_op_umax:
        case md_type_op_and:
        case md_type_op_or:
        case md_type_op_xor:
        case md_type_op_shl:
        case md_type_op_lshr: {
            uint32_t rhs = values.rbegin()[0];
            uint32_t lhs = values.rbegin()[1];
            values.pop_back();
            values.back() = md_eval_expression_type_op_2(v.type, lhs, rhs);
        } break;
        case md_type_global_size:
            assert(v.value < 3);
            values.push_back(global_size[v.value]);
            break;
        case md_type_local_size:
            assert(v.value < 3);
            values.push_back(local_size[v.value]);
            break;
        case md_type_param:
            assert(v.value < param_count);
            values.push_back(param[v.value]);
            break;
        default:
            assert(!"unknown node type");
        }
    }
    // should only be one value remaining which is the result
    assert(values.size() == 1);
    return values.back();
}
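To see the stack discipline in evaluate_expr concretely, here is a tiny standalone postfix evaluator built around the same idea, run on an arithmetic toy program (a sketch, not the production code path):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

enum op_t { push_imm, op_umul, op_add };
struct node_t { op_t type; uint32_t value; };

int main() {
    // Postfix program for (8 + 2) * 3.
    const node_t prog[] = {{push_imm, 8}, {push_imm, 2}, {op_add, 0},
                           {push_imm, 3}, {op_umul, 0}};
    std::vector<uint32_t> values;
    for (const node_t &n : prog) {
        if (n.type == push_imm) { values.push_back(n.value); continue; }
        const uint32_t rhs = values.rbegin()[0], lhs = values.rbegin()[1];
        values.pop_back();
        values.back() = (n.type == op_umul) ? lhs * rhs : lhs + rhs;
    }
    assert(values.size() == 1);                   // same invariant as evaluate_expr
    std::printf("result = %u\n", values.back());  // prints 30
    return 0;
}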
@@ -2,20 +2,30 @@
// SPDX-License-Identifier: Apache-2.0
//

#include <vpu/frontend/custom_kernel.hpp>
#include <xml_parse_utils.h>
#include <caseless.hpp>
#include <vpu/frontend/ShaveElfMetadataParser.h>
#include <vpu/frontend/custom_kernel.hpp>
#include <vpu/utils/error.hpp>
#include <vpu/utils/extra.hpp>
#include <xml_parse_utils.h>

namespace vpu {

VPU_PACKED(Elf32Shdr {
    uint32_t shName;
    uint32_t pad0[3];
    uint32_t shOffset;
    uint32_t shSize;
    uint32_t pad1[4];
};)

VPU_PACKED(Elf32Ehdr {
    uint8_t  offs1[28];
    uint32_t ePhoff;  // Program header offset
    uint32_t eShoff;  // Section header offset
    uint8_t  offs2[12];
    uint16_t eShnum;  // Number of sections
    uint16_t offs3;
    uint32_t pad0[7];
    uint32_t ePhoff;
    uint32_t eShoff;
    uint32_t pad1[3];
    uint16_t eShnum;
    uint16_t eShstrndx;
};)

VPU_PACKED(Elf32Section {
@@ -95,111 +105,66 @@ std::pair<const Elf32Section*, const Elf32Section*> findSymbolTable(
    return std::make_pair(strShdr, symShdr);
}

SmallVector<std::string> deduceKernelParameters(
    const char* ELFData,
    uint32_t kernelAddress) {
    IE_ASSERT(ELFData != nullptr);
    const auto cmp = ie::details::CaselessEq<std::string>{};
SmallVector<std::string> deduceKernelParameters(const md_parser_t& parser, int kernelId) {
    const auto kernelDesc = parser.get_kernel(kernelId);
    IE_ASSERT(kernelDesc != nullptr);
    // Number of elements we get from parser is always greater by one
    const auto argCount = kernelDesc->arg_count - 1;

    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
    auto phdr = reinterpret_cast<const Elf32Phdr*>(ELFData + ehdr->ePhoff);
    auto shdr = reinterpret_cast<const Elf32Section*>(ELFData + ehdr->eShoff);
    auto arguments = SmallVector<std::string>{};
    arguments.reserve(argCount);
    for (size_t i = 0; i < argCount; i++) {
        const auto arg = parser.get_argument(kernelDesc, i);
        VPU_THROW_UNLESS(arg, "Error while parsing custom layer elf file.");

    const Elf32Section* strShdr = nullptr;
    const Elf32Section* symShdr = nullptr;
    std::tie(strShdr, symShdr) = findSymbolTable(ELFData);
    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);

    auto numSymEntries = symShdr->shSize / symShdr->shEntsize;
    auto sym = reinterpret_cast<const Elf32Sym*>(ELFData + symShdr->shOffset);
    auto firstStr = ELFData + strShdr->shOffset;

    const char* kernelArgStrings = nullptr;
    for (size_t i = 0; i < numSymEntries; i++) {
        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) {
            kernelArgStrings = ELFData + shdr[sym[i].stShndx].shOffset;
            break;
        // skip hoisted buffers
        if (arg->flags & md_arg_flags_generated_prepost) {
            continue;
        }
    }
    IE_ASSERT(kernelArgStrings != nullptr);

    SmallVector<std::string> parameters;
    for (size_t i = 0; i < numSymEntries; i++) {
        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) {
            auto ptr = ELFData + shdr[sym[i].stShndx].shOffset;
            auto numKernels = *reinterpret_cast<const int*>(ptr);

            auto metaOffset = sizeof(int);
            for (int k = 0; k < numKernels; k++) {
                auto kHdr = reinterpret_cast<const KernelHdr*>(ptr + metaOffset);

                if (kHdr->address-phdr->pVaddr == kernelAddress) {
                    auto aHdr = reinterpret_cast<const KernelArgHdr*>(
                        reinterpret_cast<const char*>(&(kHdr->argOffset)) + sizeof(kHdr->argOffset) + kHdr->argOffset);

                    auto numArgs = reinterpret_cast<const int*>(aHdr)[-1];
                    for (int n = 0; n < numArgs; n++, aHdr++) {
                        parameters.push_back(kernelArgStrings + aHdr->stringOffset);
                    }

                    break;
                }

                metaOffset += kHdr->sectionSize + sizeof(kHdr->address) + sizeof(kHdr->flags);
            }
        }
        const auto argName = parser.get_name(arg);
        arguments.emplace_back(argName);
    }

    return parameters;
    return arguments;
}

int32_t getKernelId(
    const char* ELFData,
    uint32_t kernelAddress) {
    IE_ASSERT(ELFData != nullptr);
    const auto cmp = ie::details::CaselessEq<std::string>{};
static const Elf32Shdr *get_elf_section_with_name(const uint8_t *elf_data, const char* section_name) {
    IE_ASSERT(elf_data);
    IE_ASSERT(section_name);

    auto ehdr = reinterpret_cast<const Elf32Ehdr*>(ELFData);
    auto phdr = reinterpret_cast<const Elf32Phdr*>(ELFData + ehdr->ePhoff);
    auto shdr = reinterpret_cast<const Elf32Section*>(ELFData + ehdr->eShoff);
    const auto *ehdr = reinterpret_cast<const Elf32Ehdr *>(elf_data);
    IE_ASSERT(0 != ehdr->eShoff);
    IE_ASSERT(0 != ehdr->ePhoff);

    const Elf32Section* strShdr = nullptr;
    const Elf32Section* symShdr = nullptr;
    std::tie(strShdr, symShdr) = findSymbolTable(ELFData);
    IE_ASSERT(symShdr != nullptr && strShdr != nullptr);
    // Pointer to the first section header
    const Elf32Shdr *shdr = reinterpret_cast<const Elf32Shdr *>(elf_data + ehdr->eShoff);

    auto numSymEntries = symShdr->shSize / symShdr->shEntsize;
    auto sym = reinterpret_cast<const Elf32Sym*>(ELFData + symShdr->shOffset);
    auto firstStr = ELFData + strShdr->shOffset;
    // Pointer to section header string table header
    const Elf32Shdr *strShdr = &shdr[ehdr->eShstrndx];

    const char* kernelArgStrings = nullptr;
    for (size_t i = 0; i < numSymEntries; i++) {
        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.strings")) {
            kernelArgStrings = ELFData + shdr[sym[i].stShndx].shOffset;
            break;
        }
    // We couldn't find sections for the symbol string names and for the symbols
    // entries
    if (!strShdr) {
        return nullptr;
    }
    IE_ASSERT(kernelArgStrings != nullptr);

    for (size_t i = 0; i < numSymEntries; i++) {
        if (cmp(firstStr + sym[i].stName, "opencl.kernelArgs.info")) {
            auto ptr = ELFData + shdr[sym[i].stShndx].shOffset;
            auto numKernels = *reinterpret_cast<const int*>(ptr);
    // The string at index 0, which corresponds to the first byte, is a null
    // character
    const char *firstStr = reinterpret_cast<const char *>(elf_data + strShdr->shOffset);

            auto metaOffset = sizeof(int);
            for (int k = 0; k < numKernels; k++) {
                auto kHdr = reinterpret_cast<const KernelHdr*>(ptr + metaOffset);
    // Find the section with the custom SHAVEComputeAorta data
    for (uint16_t i = 0; i < ehdr->eShnum; i++) {
        const char *currentSectionName = firstStr + shdr[i].shName;

                if (kHdr->address-phdr->pVaddr == kernelAddress) {
                    return k;
                }

                metaOffset += kHdr->sectionSize + sizeof(kHdr->address) + sizeof(kHdr->flags);
            }
        }
        if (0 == strcmp(currentSectionName, section_name)) {
            return shdr + i;
        }
    }

    return -1;
    // If we reached this point, it means that there wasn't a section with
    // the name we were looking for
    return nullptr;
}

uint32_t getKernelEntry(const char* ELFData, const std::string& kernelName) {
@@ -230,8 +195,9 @@ uint32_t getKernelEntry(const char* ELFData, const std::string& kernelName) {
CustomKernel::CustomKernel(const pugi::xml_node& kernel, std::string configDir): _configDir{std::move(configDir)} {
    _maxShaves = XMLParseUtils::GetIntAttr(kernel, "max-shaves", 0);

    std::string fileName;
    for (auto source = kernel.child("Source"); !source.empty(); source = source.next_sibling("Source")) {
        auto fileName = _configDir + "/" + XMLParseUtils::GetStrAttr(source, "filename", "");
        fileName = _configDir + "/" + XMLParseUtils::GetStrAttr(source, "filename", "");

        std::ifstream inputFile(fileName, std::ios::binary);
        if (!inputFile.is_open()) {
@@ -244,9 +210,30 @@ CustomKernel::CustomKernel(const pugi::xml_node& kernel, std::string configDir):
    }

    const auto kernelEntryName = XMLParseUtils::GetStrAttr(kernel, "entry");
    const auto kernelEntry = getKernelEntry(&_kernelBinary[0], kernelEntryName);
    _parameters = deduceKernelParameters(&_kernelBinary[0], kernelEntry);
    _kernelId = getKernelId(&_kernelBinary[0], kernelEntry);

    const auto elf = reinterpret_cast<const uint8_t*>(_kernelBinary.data());
    const Elf32Shdr *neoMetadataShdr = get_elf_section_with_name(elf, ".neo_metadata");
    VPU_THROW_UNLESS(neoMetadataShdr, "Error while parsing custom layer elf: Couldn't find .neo_metadata section");

    const uint8_t *neoMetadata = elf + neoMetadataShdr->shOffset;
    const size_t neoMetadataSize = neoMetadataShdr->shSize;

    const Elf32Shdr *neoMetadataStrShdr = get_elf_section_with_name(elf, ".neo_metadata.str");
    VPU_THROW_UNLESS(neoMetadataStrShdr, "Error while parsing custom layer elf: Couldn't find .neo_metadata.str section");

    const char *neoMetadataStr = reinterpret_cast<const char *>(elf + neoMetadataStrShdr->shOffset);
    const size_t neoMetadataStrSize = neoMetadataStrShdr->shSize;

    const auto parser = md_parser_t{neoMetadata, neoMetadataSize, neoMetadataStr, neoMetadataStrSize};
    _kernelId = parser.get_kernel_id(kernelEntryName);
    VPU_THROW_UNLESS(_kernelId != -1, "Failed to find kernel with name `%l`", kernelEntryName);

    VPU_THROW_UNLESS(parser.get_kernel_count() == 1,
        "Failed to load kernel binary '%l'\n"
        "\tReason: binary should contain only one kernel, but contains %l",
        fileName, parser.get_kernel_count());

    _parameters = deduceKernelParameters(parser, _kernelId);

    processParametersNode(kernel);
    processWorkSizesNode(kernel);
@@ -136,7 +136,7 @@ private:
        case CustomParamType::OutputBuffer:
        case CustomParamType::Data: {
            VPU_THROW_UNLESS(ports.find(kp) != ports.end(),
                "XML specification for %s layer has no definition for %s parameter. Layer name: %s",
                "XML specification for %s layer has no definition for '%s' parameter. Layer name: %s",
                origLayer()->type, kp, origLayer()->name);

            int id = ports.find(kp)->second;
@@ -20,7 +20,7 @@ INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsFakeQuantize_smoke,
INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsQuantizeBinarize_smoke,
    ::testing::Combine(
        ::testing::ValuesIn(s_QuantizeTensors),
        ::testing::ValuesIn(s_QuantizeLevels),
        ::testing::Values(2),
        ::testing::ValuesIn(s_QuantizeSwitchOut),
        ::testing::ValuesIn(s_CustomConfig)));

@@ -799,7 +799,7 @@ TEST_P(myriadLayersTestsQuantizeBinarize_smoke, Quantize_Binarization) {
                </port>
            </output>
        </layer>
        <layer id="5" name="Quantize" precision="FP16" type="QuantizeTemporaryType">
        <layer id="5" name="Quantize" precision="FP16" type="FakeQuantizeBin">
            <data levels="@levels@" input_low_size="@input_low_size@" input_high_size="@input_high_size@" output_low_size="@output_low_size@" output_high_size="@output_high_size@" switch_out="@switch_out@"/>
            <input>
                <port id="0">
@@ -1057,6 +1057,10 @@ TEST_P(myriadLayersTestsBinaryConvolution_smoke, BinaryConvolution) {
    }
    _config[InferenceEngine::MYRIAD_CUSTOM_LAYERS] = customConfig;

    if (kernel.x == 3 && kernel.y == 3 && dilations == 2) {
        GTEST_SKIP() << "Computing wrong after hoisting";
    }

    SetInputTensor(dims);
    auto dimsOutput = dims;
    dimsOutput.h = (dims.h) / strides;
@@ -1112,7 +1116,7 @@ static std::vector<Group> s_BinaryConvolutionGroup = {
static std::vector<Kernel> s_BinaryConvolutionKernel = {
    {{1, 1}},
    {{1, 3}},
    {{3, 3}},
    {{3, 3}}
};
static std::vector<Strides> s_BinaryConvolutionStrides = {
    1, 2

@@ -14,5 +14,22 @@ INSTANTIATE_TEST_CASE_P(
    ::testing::Values<DoSoftmax>(1, 0),
    ::testing::Values(vpu::LayoutPreference::ChannelMajor, vpu::LayoutPreference::ChannelMinor),
    ::testing::Values(IRVersion::v7, IRVersion::v10),
    ::testing::ValuesIn(s_CustomConfig)
    ::testing::Values("")
));

#ifdef VPU_HAS_CUSTOM_KERNELS

INSTANTIATE_TEST_CASE_P(
    accuracy_custom, myriadLayersTestsRegionYolo_smoke,
    ::testing::Combine(
        ::testing::Values<Coords>(4),
        ::testing::Values<Classes>(20),
        ::testing::Values<Num>(5, 10),
        ::testing::Values<MaskSize>(3),
        ::testing::Values<DoSoftmax>(1, 0),
        ::testing::Values(vpu::LayoutPreference::ChannelMajor, vpu::LayoutPreference::ChannelMinor),
        ::testing::Values(IRVersion::v7, IRVersion::v10),
        ::testing::Values(s_CustomConfig[1])
));

#endif

@@ -9,5 +9,17 @@ INSTANTIATE_TEST_CASE_P(accuracy, myriadLayersTestsReorg_smoke, ::testing::Combi
    ::testing::Values<Stride>(2),
    ::testing::Values(vpu::LayoutPreference::ChannelMinor, vpu::LayoutPreference::ChannelMajor),
    ::testing::Values(IRVersion::v7, IRVersion::v10),
    ::testing::ValuesIn(s_CustomConfig)
    ::testing::Values<CustomConfig>({})
));

#ifdef VPU_HAS_CUSTOM_KERNELS

INSTANTIATE_TEST_CASE_P(accuracy_custom, myriadLayersTestsReorg_smoke, ::testing::Combine(
    ::testing::ValuesIn(s_ReorgInputs_CustomLayer),
    ::testing::Values<Stride>(2),
    ::testing::Values(vpu::LayoutPreference::ChannelMinor, vpu::LayoutPreference::ChannelMajor),
    ::testing::Values(IRVersion::v7, IRVersion::v10),
    ::testing::Values(s_CustomConfig[1])
));

#endif

@@ -111,3 +111,9 @@ static std::vector<SizeVector> s_ReorgInputs = {
    {1, 192, 6 * 26, 6 * 26},
    {1, 4, 6, 6}
};

static std::vector<SizeVector> s_ReorgInputs_CustomLayer = {
    {1, 64, 26, 26},
    {1, 64, 128, 128},
    {1, 4, 6, 6}
};

@@ -4,13 +4,26 @@

#include "myriad_layers_resample_test.hpp"

// #-31522
INSTANTIATE_TEST_CASE_P(
    DISABLED_accuracy, myriadResampleLayerTests_smoke,
    accuracy, myriadResampleLayerTests_smoke,
    ::testing::Combine(
        ::testing::ValuesIn(s_ResampleInput),
        ::testing::Values<Factor>(2.0f, 0.5f),
        ::testing::Values<Antialias>(false),
        ::testing::Values<HwOptimization>(false, true),
        ::testing::Values(""))
);

#ifdef VPU_HAS_CUSTOM_KERNELS

INSTANTIATE_TEST_CASE_P(
    accuracy_custom, myriadResampleLayerTests_smoke,
    ::testing::Combine(
        ::testing::ValuesIn(s_ResampleInput),
        ::testing::Values<Factor>(2.0f),
        ::testing::Values<Antialias>(false, true),
        ::testing::Values<HwOptimization>(false, true),
        ::testing::ValuesIn(s_CustomConfig))
        ::testing::Values(s_CustomConfig[1]))
);

#endif