[IE CLDNN] Fully connected MMAD kernel optimizations (#2115)
This commit is contained in:
parent
5403003d02
commit
3797a28e65
@ -17,10 +17,6 @@
|
|||||||
|
|
||||||
namespace kernel_selector {
|
namespace kernel_selector {
|
||||||
|
|
||||||
namespace {
|
|
||||||
static const size_t sub_group_size = 8;
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const {
|
ParamsKey FullyConnectedKernelMMAD::GetSupportedKey() const {
|
||||||
ParamsKey k;
|
ParamsKey k;
|
||||||
k.EnableInputDataType(Datatype::INT8);
|
k.EnableInputDataType(Datatype::INT8);
|
||||||
@ -65,14 +61,32 @@ bool FullyConnectedKernelMMAD::Validate(const Params& params, const optional_par
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::SetTuningParams(const fully_connected_params& params) const {
|
||||||
|
FullyConnectedTuningData tuning_data;
|
||||||
|
|
||||||
|
const auto& input = params.inputs[0];
|
||||||
|
|
||||||
|
size_t feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0 ?
|
||||||
|
input.Feature().v / 32 : CeilDiv(input.Feature().v, 32);
|
||||||
|
|
||||||
|
if (feature_blocks_count)
|
||||||
|
while (feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
|
||||||
|
(tuning_data.slm_div_factor * 2 <= params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size))
|
||||||
|
tuning_data.slm_div_factor *= 2;
|
||||||
|
|
||||||
|
tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
|
||||||
|
|
||||||
|
return tuning_data;
|
||||||
|
}
|
||||||
|
|
||||||
FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params,
|
FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(const fully_connected_params& params,
|
||||||
int) const {
|
int) const {
|
||||||
|
FullyConnectedTuningData tuning_data = SetTuningParams(params);
|
||||||
auto runInfo = Parent::SetDefault(params);
|
auto runInfo = Parent::SetDefault(params);
|
||||||
|
const auto& output = params.output;
|
||||||
|
|
||||||
const auto& out = params.output;
|
std::vector<size_t> global = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
|
||||||
|
std::vector<size_t> local = { tuning_data.work_group_size, 1, 1 };
|
||||||
std::vector<size_t> global = { Align(out.Feature().v, sub_group_size), out.Batch().v, 1 };
|
|
||||||
auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
|
|
||||||
|
|
||||||
runInfo.gws0 = global[0];
|
runInfo.gws0 = global[0];
|
||||||
runInfo.gws1 = global[1];
|
runInfo.gws1 = global[1];
|
||||||
@ -87,12 +101,14 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons
|
|||||||
|
|
||||||
JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params,
|
JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_params& params,
|
||||||
const DispatchData& runInfo) const {
|
const DispatchData& runInfo) const {
|
||||||
|
FullyConnectedTuningData tuning_data = SetTuningParams(params);
|
||||||
|
|
||||||
auto jit = Parent::GetJitConstants(params, runInfo);
|
auto jit = Parent::GetJitConstants(params, runInfo);
|
||||||
|
|
||||||
auto& input = params.inputs[0];
|
auto& input = params.inputs[0];
|
||||||
auto& weights = params.weights;
|
auto& weights = params.weights;
|
||||||
|
|
||||||
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
|
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
|
||||||
if (input.GetDims().size() == 5) {
|
if (input.GetDims().size() == 5) {
|
||||||
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
|
jit.AddConstant(MakeJitConstant("FILTER_GET_OFFSET(f)", "GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_INDEX(FILTER, f, 0, 0, 0)"));
|
||||||
} else {
|
} else {
|
||||||
@ -137,13 +153,33 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
|
|||||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * 32));
|
jit.AddConstant(MakeJitConstant("MMAD_INPUT_FBLOCK_PITCH", input.Feature().pitch * 32));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
|
||||||
|
|
||||||
|
size_t feature_blocks_count;
|
||||||
|
size_t temp_unroll_factor = 9, unroll_factor, full_unroll_factor;
|
||||||
|
|
||||||
if (input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0) {
|
if (input.GetLayout() == DataLayout::bfyx && input.Feature().v % 32 != 0) {
|
||||||
|
feature_blocks_count = input.Feature().v / 32;
|
||||||
jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", true));
|
jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", true));
|
||||||
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", input.Feature().v / 32));
|
|
||||||
} else {
|
} else {
|
||||||
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", CeilDiv(input.Feature().v, 32)));
|
feature_blocks_count = CeilDiv(input.Feature().v, 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
full_unroll_factor = feature_blocks_count / tuning_data.slm_div_factor;
|
||||||
|
|
||||||
|
if (full_unroll_factor > 9) {
|
||||||
|
while (full_unroll_factor % temp_unroll_factor)
|
||||||
|
temp_unroll_factor--;
|
||||||
|
unroll_factor = temp_unroll_factor;
|
||||||
|
} else {
|
||||||
|
unroll_factor = full_unroll_factor;
|
||||||
|
}
|
||||||
|
|
||||||
|
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", feature_blocks_count));
|
||||||
|
jit.AddConstant(MakeJitConstant("UNROLL_FACTOR", unroll_factor));
|
||||||
|
jit.AddConstant(MakeJitConstant("FULL_UNROLL_FACTOR", full_unroll_factor));
|
||||||
|
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
|
||||||
|
|
||||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch));
|
jit.AddConstant(MakeJitConstant("MMAD_INPUT_SPATIAL_PITCH", input_x_pitch));
|
||||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_X_PITCH", input_x_pitch));
|
jit.AddConstant(MakeJitConstant("MMAD_INPUT_X_PITCH", input_x_pitch));
|
||||||
jit.AddConstant(MakeJitConstant("MMAD_INPUT_Y_PITCH", input_y_pitch));
|
jit.AddConstant(MakeJitConstant("MMAD_INPUT_Y_PITCH", input_y_pitch));
|
||||||
@ -158,7 +194,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
|
|||||||
|
|
||||||
if (!params.fused_ops.empty()) {
|
if (!params.fused_ops.empty()) {
|
||||||
auto input_dt = GetActivationType(params);
|
auto input_dt = GetActivationType(params);
|
||||||
FusedOpsConfiguration conf = { "", {"b", "f", "0", "0"}, "dequantized", input_dt, 1 };
|
FusedOpsConfiguration conf = { "", {"batch", "feature", "0", "0"}, "dequantized", input_dt, 1 };
|
||||||
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
|
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,7 +216,7 @@ KernelsData FullyConnectedKernelMMAD::GetKernelsData(const Params& params, const
|
|||||||
options,
|
options,
|
||||||
input.GetLayout(),
|
input.GetLayout(),
|
||||||
w_layout,
|
w_layout,
|
||||||
FORCE_PRIORITY_9,
|
FORCE_PRIORITY_7,
|
||||||
static_cast<int>(i));
|
static_cast<int>(i));
|
||||||
if (!kd.empty()) {
|
if (!kd.empty()) {
|
||||||
res.emplace_back(kd[0]);
|
res.emplace_back(kd[0]);
|
||||||
|
@ -29,6 +29,12 @@ public:
|
|||||||
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
|
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
|
||||||
ParamsKey GetSupportedKey() const override;
|
ParamsKey GetSupportedKey() const override;
|
||||||
|
|
||||||
|
struct FullyConnectedTuningData {
|
||||||
|
const size_t sub_group_size = 8;
|
||||||
|
size_t slm_div_factor = 1;
|
||||||
|
size_t work_group_size = 1;
|
||||||
|
};
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
|
JitConstants GetJitConstants(const fully_connected_params& params, const DispatchData& kd) const override;
|
||||||
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
|
DispatchData SetDefault(const fully_connected_params& params, int autoTuneIndex = -1) const override;
|
||||||
@ -38,5 +44,6 @@ protected:
|
|||||||
FusedOpType::ACTIVATION };
|
FusedOpType::ACTIVATION };
|
||||||
}
|
}
|
||||||
bool Validate(const Params& params, const optional_params& options) const override;
|
bool Validate(const Params& params, const optional_params& options) const override;
|
||||||
|
FullyConnectedTuningData SetTuningParams(const fully_connected_params& params) const;
|
||||||
};
|
};
|
||||||
} // namespace kernel_selector
|
} // namespace kernel_selector
|
||||||
|
@ -37,25 +37,35 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
#endif
|
#endif
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
#if OUTPUT_BATCH_NUM == 1
|
const uint lid0 = (uint)get_local_id(0);
|
||||||
const uint f = (uint)get_global_id(0);
|
const uint feature_per_wg = (uint)get_local_size(0) / SLM_DIV_FACTOR;
|
||||||
const uint b = 0;
|
const uint feature = (uint)get_group_id(0) * feature_per_wg + (uint)get_global_id(0) % feature_per_wg;
|
||||||
#else
|
const uint feature_block = lid0 / feature_per_wg;
|
||||||
const uint f = (uint)get_global_id(0);
|
const uint batch = (uint)get_global_id(1);
|
||||||
const uint b = (uint)get_global_id(1);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int dotProd = 0;
|
int dotProd = 0;
|
||||||
|
|
||||||
const uint filter_offset = FILTER_GET_OFFSET(f);
|
const uint filter_offset = FILTER_GET_OFFSET(feature);
|
||||||
#if INPUT0_DIMS == 5
|
#if INPUT0_DIMS == 5
|
||||||
const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0, 0);
|
const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0, 0);
|
||||||
#else
|
#else
|
||||||
const uint input_offset = INPUT0_GET_INDEX(b, 0, 0, 0);
|
const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
__local int partial_summ[WORK_GROUP_SIZE];
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if SPATIAL_MAJOR
|
#if SPATIAL_MAJOR
|
||||||
for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
|
|
||||||
|
#if FULL_UNROLL_FACTOR < 2
|
||||||
|
for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k)
|
||||||
|
#elif UNROLL_FACTOR == FULL_UNROLL_FACTOR
|
||||||
|
uint k = feature_block * FULL_UNROLL_FACTOR;
|
||||||
|
#else
|
||||||
|
for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
# if !SPLIT_SPATIAL
|
# if !SPLIT_SPATIAL
|
||||||
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
|
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
|
||||||
# else
|
# else
|
||||||
@ -73,7 +83,15 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
|
for (uint xi = 0; xi < FILTER_SIZE_X; ++xi) {
|
||||||
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
|
const uint spatial = xi + yi * FILTER_SIZE_X + zi * FILTER_SIZE_X * FILTER_SIZE_Y;
|
||||||
# endif
|
# endif
|
||||||
for (uint k = 0; k < FEATURE_BLOCKS_COUNT; ++k) {
|
|
||||||
|
#if FULL_UNROLL_FACTOR < 2
|
||||||
|
for (uint k = feature_block * FULL_UNROLL_FACTOR; k < (feature_block + 1) * FULL_UNROLL_FACTOR; ++k)
|
||||||
|
#elif UNROLL_FACTOR == FULL_UNROLL_FACTOR
|
||||||
|
uint k = feature_block * FULL_UNROLL_FACTOR;
|
||||||
|
#else
|
||||||
|
for (uint k = feature_block * FULL_UNROLL_FACTOR; k + UNROLL_FACTOR <= (feature_block + 1) * FULL_UNROLL_FACTOR; k += UNROLL_FACTOR)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
#endif
|
#endif
|
||||||
#if !SPLIT_SPATIAL
|
#if !SPLIT_SPATIAL
|
||||||
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH;
|
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + k * MMAD_INPUT_FBLOCK_PITCH;
|
||||||
@ -82,10 +100,12 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
#endif
|
#endif
|
||||||
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH;
|
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + k * MMAD_FILTER_FBLOCK_PITCH;
|
||||||
|
|
||||||
|
#if UNROLL_FACTOR < 2
|
||||||
uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx));
|
uint input_data_u = intel_sub_group_block_read((const __global uint*)(input + input_idx));
|
||||||
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
|
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
|
||||||
|
|
||||||
INPUT_PACKED_TYPE_8 activations; //activations of all lanes
|
INPUT_PACKED_TYPE_8 activations;
|
||||||
|
|
||||||
activations.s0 = sub_group_broadcast(input_data, 0);
|
activations.s0 = sub_group_broadcast(input_data, 0);
|
||||||
activations.s1 = sub_group_broadcast(input_data, 1);
|
activations.s1 = sub_group_broadcast(input_data, 1);
|
||||||
activations.s2 = sub_group_broadcast(input_data, 2);
|
activations.s2 = sub_group_broadcast(input_data, 2);
|
||||||
@ -99,11 +119,50 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
|
FILTER_PACKED_TYPE_8 weights_data = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u);
|
||||||
|
|
||||||
dotProd = MMAD_8(activations, weights_data, dotProd);
|
dotProd = MMAD_8(activations, weights_data, dotProd);
|
||||||
|
#else
|
||||||
|
INPUT_PACKED_TYPE input_data[UNROLL_FACTOR];
|
||||||
|
FILTER_PACKED_TYPE_8 weights_data[UNROLL_FACTOR];
|
||||||
|
|
||||||
|
__attribute__((opencl_unroll_hint))
|
||||||
|
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||||
|
input_data[kb] = AS_TYPE(INPUT_PACKED_TYPE, intel_sub_group_block_read((const __global uint*)(input +
|
||||||
|
input_idx + kb * MMAD_INPUT_FBLOCK_PITCH)));
|
||||||
|
|
||||||
|
uint8 weights_data_u0 = intel_sub_group_block_read8((const __global uint*)(weights + filter_idx + kb * MMAD_FILTER_FBLOCK_PITCH));
|
||||||
|
weights_data[kb] = AS_TYPE(FILTER_PACKED_TYPE_8, weights_data_u0);
|
||||||
|
}
|
||||||
|
|
||||||
|
__attribute__((opencl_unroll_hint))
|
||||||
|
for (uint kb = 0; kb < UNROLL_FACTOR; kb++) {
|
||||||
|
INPUT_PACKED_TYPE_8 in;
|
||||||
|
|
||||||
|
in.s0 = sub_group_broadcast(input_data[kb], 0);
|
||||||
|
in.s1 = sub_group_broadcast(input_data[kb], 1);
|
||||||
|
in.s2 = sub_group_broadcast(input_data[kb], 2);
|
||||||
|
in.s3 = sub_group_broadcast(input_data[kb], 3);
|
||||||
|
in.s4 = sub_group_broadcast(input_data[kb], 4);
|
||||||
|
in.s5 = sub_group_broadcast(input_data[kb], 5);
|
||||||
|
in.s6 = sub_group_broadcast(input_data[kb], 6);
|
||||||
|
in.s7 = sub_group_broadcast(input_data[kb], 7);
|
||||||
|
|
||||||
|
dotProd = MMAD_8(in, weights_data[kb], dotProd);
|
||||||
|
}
|
||||||
|
#endif // UNROLL_FACTOR < 2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
partial_summ[lid0] = dotProd;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (feature_block == 0) {
|
||||||
|
__attribute__((opencl_unroll_hint))
|
||||||
|
for (uint i = 1; i < SLM_DIV_FACTOR; i++)
|
||||||
|
dotProd += partial_summ[lid0 % feature_per_wg + i * feature_per_wg];
|
||||||
|
#endif // SLM_DIV_FACTOR > 1
|
||||||
|
|
||||||
#if HAS_FEATURE_LEFTOVERS
|
#if HAS_FEATURE_LEFTOVERS
|
||||||
const uint lid = get_sub_group_local_id();
|
const uint sglid = get_sub_group_local_id();
|
||||||
#if SPATIAL_MAJOR
|
#if SPATIAL_MAJOR
|
||||||
#if !SPLIT_SPATIAL
|
#if !SPLIT_SPATIAL
|
||||||
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
|
for (uint spatial = 0; spatial < FILTER_SPATIAL_SIZE; ++spatial) {
|
||||||
@ -128,14 +187,14 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
#if !SPLIT_SPATIAL
|
#if !SPLIT_SPATIAL
|
||||||
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
|
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
|
||||||
#else // !SPLIT_SPATIAL
|
#else // !SPLIT_SPATIAL
|
||||||
uint input_idx = input_offset + FEATURE_BLOCK_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
|
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH + zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
|
||||||
#endif // !SPLIT_SPATIAL
|
#endif // !SPLIT_SPATIAL
|
||||||
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
|
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;
|
||||||
|
|
||||||
MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
|
MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
|
||||||
for (uint i = 0; i < 4; i++) {
|
for (uint i = 0; i < 4; i++) {
|
||||||
if (FEATURE_BLOCKS_COUNT*32 + lid*4 + i < INPUT0_FEATURE_NUM) {
|
if (FEATURE_BLOCKS_COUNT * 32 + sglid * 4 + i < INPUT0_FEATURE_NUM) {
|
||||||
input_data_u[i] = input[input_idx + (lid*4 + i)*INPUT0_FEATURE_PITCH];
|
input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
|
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
|
||||||
@ -157,14 +216,14 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
}
|
}
|
||||||
#endif // HAS_FEATURE_LEFTOVERS
|
#endif // HAS_FEATURE_LEFTOVERS
|
||||||
|
|
||||||
if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && f >= OUTPUT_FEATURE_NUM)
|
if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && feature >= OUTPUT_FEATURE_NUM)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
#if BIAS_TERM
|
#if BIAS_TERM
|
||||||
#if BIAS_PER_OUTPUT
|
#if BIAS_PER_OUTPUT
|
||||||
const uint bias_index = GET_DATA_INDEX(BIAS, b, f, 0, 0);
|
const uint bias_index = GET_DATA_INDEX(BIAS, batch, feature, 0, 0);
|
||||||
#elif BIAS_PER_OFM
|
#elif BIAS_PER_OFM
|
||||||
const uint bias_index = f;
|
const uint bias_index = feature;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
float dequantized = (float)dotProd + biases[bias_index];
|
float dequantized = (float)dotProd + biases[bias_index];
|
||||||
@ -172,7 +231,7 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
float dequantized = (float)dotProd;
|
float dequantized = (float)dotProd;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const uint out_idx = OUTPUT_GET_INDEX(b, f, 0, 0);
|
const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0);
|
||||||
|
|
||||||
#if HAS_FUSED_OPS
|
#if HAS_FUSED_OPS
|
||||||
FUSED_OPS;
|
FUSED_OPS;
|
||||||
@ -182,6 +241,10 @@ KERNEL(fully_connected_gpu_MMAD)(
|
|||||||
#else
|
#else
|
||||||
output[out_idx] = TO_OUTPUT_TYPE(dequantized);
|
output[out_idx] = TO_OUTPUT_TYPE(dequantized);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if SLM_DIV_FACTOR > 1
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef INPUT_PACKED_TYPE_8
|
#undef INPUT_PACKED_TYPE_8
|
||||||
|
Loading…
Reference in New Issue
Block a user