[IE CLDNN] Fixed fsv16 lrn kernel with fp16 input (#2086)

This commit is contained in:
Vladimir Paramuzov 2020-09-07 09:04:05 +03:00 committed by GitHub
parent 0cd0c1a551
commit 51564f415c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 2 deletions

View File

@ -41,14 +41,15 @@ KERNEL (lrn_gpu_across_channel_multiple_features_fsv16)(
for (uint i = 0; i < LOCAL_SIZE; ++i, ++input_offset_f) {
bool non_zero = input_offset_f >= 0 && input_offset_f < INPUT0_FEATURE_NUM;
uint input_idx = INPUT0_GET_INDEX(batch_id, input_offset_f, y, x);
val[i] = (int)non_zero * TO_INPUT0_TYPE(input[input_idx]);
val[i] = (int)non_zero * TO_INPUT0_TYPE(ALPHA_VAL_FACTOR_DIV_BY_SIZE) * TO_INPUT0_TYPE(input[input_idx]);
res = mad(val[i], val[i], res);
}
res = mad(res, TO_INPUT0_TYPE(ALPHA_DIV_BY_SIZE), TO_INPUT0_TYPE(K));
res = native_powr(res, -TO_INPUT0_TYPE(BETA));
uint output_idx = OUTPUT_GET_INDEX(batch_id, feature_id, y, x);
INPUT0_TYPE lrn_result = res * val[PADDING];
uint input_idx = INPUT0_GET_INDEX(batch_id, feature_id, y, x);
INPUT0_TYPE lrn_result = res * input[input_idx];
#if HAS_FUSED_OPS
FUSED_OPS;
output[output_idx] = TO_OUTPUT_TYPE(FUSED_OPS_RESULT);

View File

@ -125,6 +125,55 @@ TEST(lrn_fp32_gpu, basic2) {
}
}
TEST(lrn_fp16_gpu, basic1) {
// input : 1x16x1x1
// Output : 1x16x1x1
const auto& engine = get_test_engine();
const size_t b = 1;
const size_t f = 16;
const size_t y = 1;
const size_t x = 1;
auto input = memory::allocate(engine, { data_types::f16, format::b_fs_yx_fsv16, { b, f, x, y } });
std::vector<half_t> inputVals(b * f * y * x);
std::generate(inputVals.begin(), inputVals.end(), []() {
static float n = 0;
return half_t(n++);
});
set_values(input, inputVals);
topology topology;
topology.add(input_layout("input", input.get_layout()));
uint32_t size = 5;
float k = 0.5f;
float alpha = 9.9e-05f;
float beta = 1.f;
topology.add(lrn("lrn", "input", size, k, alpha, beta, cldnn::lrn_norm_region_across_channel));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
auto output = outputs.at("lrn").get_memory();
auto output_ptr = output.pointer<uint16_t>();
std::vector<float> expected_results = {
0.f, 1.99889f, 3.99525f, 5.98696f,
7.97159f, 9.94682f, 11.9104f, 13.86f,
15.7936f, 17.709f, 19.6041f, 21.4769f,
23.3257f, 25.1485f, 27.2091f, 29.3151f
};
ASSERT_EQ(output_ptr.size(), expected_results.size());
for (size_t i = 0; i < expected_results.size(); ++i) {
EXPECT_TRUE(are_equal(expected_results[i], half_to_float(output_ptr[i]))) << i;
}
}
TEST(lrn_fp32_gpu, basic3) {
// input : 2x16x4x4
// Output : 2x16x4x4