[GPU] Cum sum int32/64 support (#10629)

This commit is contained in:
Ilya Znamenskiy 2022-02-25 15:48:17 +03:00 committed by GitHub
parent e9e59cb954
commit 94cbbe063b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 176 additions and 161 deletions

View File

@ -72,6 +72,12 @@ namespace detail {
attach_cum_sum_impl::attach_cum_sum_impl() {
implementation_map<cum_sum>::add(impl_types::ocl, cum_sum_impl::create, {
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::i32, format::bfzyx),
std::make_tuple(data_types::i32, format::bfwzyx),
std::make_tuple(data_types::i64, format::bfyx),
std::make_tuple(data_types::i64, format::bfzyx),
std::make_tuple(data_types::i64, format::bfwzyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::f16, format::bfzyx),
std::make_tuple(data_types::f16, format::bfwzyx),

View File

@ -42,8 +42,12 @@ ParamsKey CumSumKernelBase::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableInputDataType(Datatype::INT32);
k.EnableInputDataType(Datatype::INT64);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::INT32);
k.EnableOutputDataType(Datatype::INT64);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableInputLayout(DataLayout::bfzyx);

View File

@ -13,6 +13,26 @@ namespace kernel_selector {
static constexpr size_t simd = 16;
static constexpr size_t BLOCK_SIZE = 16;
// Advertises the data types and layouts this partial-sum kernel can handle.
// NOTE(review): unlike the base cum_sum kernel key (which also enables
// INT32/INT64), this kernel is float-only — presumably integer inputs fall
// back to another cum_sum implementation; confirm against the kernel selector.
ParamsKey CumSumKernelPartialSum::GetSupportedKey() const {
    ParamsKey key;

    // Floating-point input/output only.
    key.EnableInputDataType(Datatype::F16);
    key.EnableOutputDataType(Datatype::F16);
    key.EnableInputDataType(Datatype::F32);
    key.EnableOutputDataType(Datatype::F32);

    // 4D, 5D and 6D plain layouts, same layout in and out.
    key.EnableInputLayout(DataLayout::bfyx);
    key.EnableOutputLayout(DataLayout::bfyx);
    key.EnableInputLayout(DataLayout::bfzyx);
    key.EnableOutputLayout(DataLayout::bfzyx);
    key.EnableInputLayout(DataLayout::bfwzyx);
    key.EnableOutputLayout(DataLayout::bfwzyx);

    key.EnableTensorOffset();
    key.EnableTensorPitches();
    key.EnableBatching();
    return key;
}
JitConstants CumSumKernelPartialSum::GetJitConstants(const cum_sum_params& params, DispatchData dispatchData) const {
auto jits = CumSumKernelBase::GetJitConstants(params, dispatchData);

View File

@ -11,6 +11,8 @@ class CumSumKernelPartialSum : public CumSumKernelBase {
public:
CumSumKernelPartialSum() : CumSumKernelBase("cum_sum_partial_sum") {}
virtual ~CumSumKernelPartialSum() = default;
ParamsKey GetSupportedKey() const override;
protected:
struct MultiDispatchData {
DispatchData stage_1;

View File

@ -26,6 +26,7 @@ static std::vector<T> cumsum(const std::vector<T>& input,
std::vector<T> output(input.size());
int dimNum = 0;
std::vector<int> reordered_shape = shape;
if (format == format::bfwzyx) {
dimNum = 6;
} else if (format == format::bfzyx) {
@ -39,6 +40,7 @@ static std::vector<T> cumsum(const std::vector<T>& input,
reordered_shape[i] = shape[i + 2];
}
}
std::vector<int> sizeDim(dimNum);
sizeDim[dimNum - 1] = 1;
for (size_t i = dimNum - 1, mult = 1; i > 0; --i) {
@ -55,6 +57,7 @@ static std::vector<T> cumsum(const std::vector<T>& input,
}
return fullInd;
};
auto getIndex = [&sizeDim](std::vector<int> fullInd) {
size_t index = 0;
for (size_t i = 0; i < fullInd.size(); ++i) {
@ -69,22 +72,23 @@ static std::vector<T> cumsum(const std::vector<T>& input,
int stopInd = fullInd[axis] + 1;
if (reverse) {
stopInd = reordered_shape[axis];
if (exclusive)
if (exclusive) {
++fullInd[axis];
}
else {
}
} else {
fullInd[axis] = 0;
if (exclusive) {
--stopInd;
}
}
float res = 0.f;
T res = (T)0;
for (; fullInd[axis] < stopInd; ++fullInd[axis]) {
auto ind = getIndex(fullInd);
res += input[ind];
}
output[i] = res;
output[i] = (T)res;
}
return output;
}
@ -98,16 +102,6 @@ static std::vector<T1> vectorCast(const std::vector<T2>& vec) {
return ret;
}
// Builds a vector of `sz` sequential values: 0, 1, 2, ..., sz-1 (cast to T).
template<typename T = float>
static std::vector<T> generateVector(size_t sz) {
    std::vector<T> result;
    result.reserve(sz);
    for (size_t i = 0; i < sz; ++i) {
        result.push_back(static_cast<T>(i));
    }
    return result;
}
static cldnn::cum_sum::cum_sum_axis getCumSumAxis(int axis, unsigned sz) {
unsigned cldnn_axis = axis;
if (axis >= 2) {
@ -133,6 +127,31 @@ static cldnn::cum_sum::cum_sum_axis getCumSumAxis(int axis, unsigned sz) {
}
}
// Parameter packs for INSTANTIATE_TEST_SUITE_P. Each pack supplies the ten
// generator arguments of cum_sum_test_params in order:
// batch, feature, w, z, y, x, tensor format, axis list, exclusive, reverse.
// (Comments sit between macros: a // inside a backslash-continued macro
// would swallow the line continuation.)
// 1-D case: shape 5x1x1x1x1x1 (bfyx), axes[0] = {0}.
#define CASE_CUM_SUM_AXIS_0 ::testing::Values(5), ::testing::Values(1), ::testing::Values(1), \
::testing::Values(1), ::testing::Values(1), ::testing::Values(1), \
::testing::Values(format::bfyx), ::testing::ValuesIn(axes[0]), \
::testing::ValuesIn(variants), ::testing::ValuesIn(variants)
// 2-D case: shape 2x5x1x1x1x1 (bfyx), axes[1] = {0, 1}.
#define CASE_CUM_SUM_AXIS_1 ::testing::Values(2), ::testing::Values(5), ::testing::Values(1), \
::testing::Values(1), ::testing::Values(1), ::testing::Values(1), \
::testing::Values(format::bfyx), ::testing::ValuesIn(axes[1]), \
::testing::ValuesIn(variants), ::testing::ValuesIn(variants)
// 3-D case: shape 5x5x1x1x5x1 (bfyx), axes[2] = {0, 1, 2}.
#define CASE_CUM_SUM_AXIS_2 ::testing::Values(5), ::testing::Values(5), ::testing::Values(1), \
::testing::Values(1), ::testing::Values(5), ::testing::Values(1), \
::testing::Values(format::bfyx), ::testing::ValuesIn(axes[2]), \
::testing::ValuesIn(variants), ::testing::ValuesIn(variants)
// 4-D case: shape 5x5x1x1x5x5 (bfyx), axes[3] = {0, 1, 2, 3}.
#define CASE_CUM_SUM_AXIS_3 ::testing::Values(5), ::testing::Values(5), ::testing::Values(1), \
::testing::Values(1), ::testing::Values(5), ::testing::Values(5), \
::testing::Values(format::bfyx), ::testing::ValuesIn(axes[3]), \
::testing::ValuesIn(variants), ::testing::ValuesIn(variants)
// 5-D case: shape 5x5x1x5x5x5 (bfzyx), axes[4] = {0, 1, 2, 3, 4}.
#define CASE_CUM_SUM_AXIS_4 ::testing::Values(5), ::testing::Values(5), ::testing::Values(1), \
::testing::Values(5), ::testing::Values(5), ::testing::Values(5), \
::testing::Values(format::bfzyx), ::testing::ValuesIn(axes[4]), \
::testing::ValuesIn(variants), ::testing::ValuesIn(variants)
// 6-D case: shape 5x5x5x5x5x5 (bfwzyx), axes[5] = {0, 1, 2, 3, 4, 5}.
#define CASE_CUM_SUM_AXIS_5 ::testing::Values(5), ::testing::Values(5), ::testing::Values(5), \
::testing::Values(5), ::testing::Values(5), ::testing::Values(5), \
::testing::Values(format::bfwzyx), ::testing::ValuesIn(axes[5]), \
::testing::ValuesIn(variants), ::testing::ValuesIn(variants)
using cum_sum_test_params = std::tuple<int, // batch
int, // feature
int, // w
@ -143,165 +162,129 @@ using cum_sum_test_params = std::tuple<int, // batch
int, // axis
bool, // exclusive
bool>; // reverse
class cum_sum_gpu : public ::testing::TestWithParam<cum_sum_test_params> {};
TEST_P(cum_sum_gpu, basic_test) {
auto p = GetParam();
auto& engine = get_test_engine();
template <typename cum_sum_params, typename input_type = float, typename output_type = float>
class cum_sum_gpu : public ::testing::TestWithParam<cum_sum_params> {
public:
auto b = std::get<0>(p);
auto f = std::get<1>(p);
auto w = std::get<2>(p);
auto z = std::get<3>(p);
auto y = std::get<4>(p);
auto x = std::get<5>(p);
tensor shape = tensor{batch(b), feature(f), spatial(x, y, z, w)};
auto in_out_format = std::get<6>(p);
auto axis = std::get<7>(p);
auto exclusive = std::get<8>(p);
auto reverse = std::get<9>(p);
auto size = 4;
if (in_out_format == format::bfwzyx)
size = 6;
else if (in_out_format == format::bfzyx)
size = 5;
auto input = engine.allocate_memory({ data_types::f32, in_out_format, shape });
const int inputSize = b * f * w * z * y * x;
auto inputVals = generateVector(inputSize);
set_values(input, inputVals);
topology topology;
topology.add(input_layout("Input0", input->get_layout()));
topology.add(cum_sum("cum_sum", "Input0", getCumSumAxis(axis, size), exclusive, reverse));
network network(engine, topology);
network.set_input_data("Input0", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "cum_sum");
auto output = outputs.at("cum_sum").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());
auto answers = cumsum(inputVals, in_out_format, { b, f, w, z, y, x }, axis, exclusive, reverse);
ASSERT_EQ(output_ptr.size(), answers.size());
for (size_t i = 0; i < answers.size(); ++i)
{
EXPECT_TRUE(are_equal(answers[i], output_ptr[i])) << i;
data_types get_alloc_data_type(void) {
if (std::is_same<input_type, float>::value)
return data_types::f32;
else if (std::is_same<input_type, FLOAT16>::value)
return data_types::f16;
else if (std::is_same<input_type, int32_t>::value)
return data_types::i32;
else if (std::is_same<input_type, int64_t>::value)
return data_types::i64;
else
throw std::runtime_error("Unsupported cum sum data type in cum_sum_gpu_test.cpp");
}
}
void execute(cum_sum_params& p) {
auto& engine = get_test_engine();
auto b = std::get<0>(p);
auto f = std::get<1>(p);
auto w = std::get<2>(p);
auto z = std::get<3>(p);
auto y = std::get<4>(p);
auto x = std::get<5>(p);
tensor shape = tensor{ batch(b), feature(f), spatial(x, y, z, w) };
auto in_out_format = std::get<6>(p);
auto axis = std::get<7>(p);
auto exclusive = std::get<8>(p);
auto reverse = std::get<9>(p);
auto size = 4;
if (in_out_format == format::bfzyx)
size = 5;
else if (in_out_format == format::bfwzyx)
size = 6;
auto input = engine.allocate_memory({ get_alloc_data_type(), in_out_format, shape });
const int inputSize = b * f * w * z * y * x;
VF<input_type> inputVals = std::is_same<input_type, FLOAT16>::value ?
generate_random_1d<input_type>(inputSize, -1, 1, 1) :
generate_random_1d<input_type>(inputSize, -100, 100, 8);
set_values(input, inputVals);
topology topology;
topology.add(input_layout("Input0", input->get_layout()));
topology.add(cum_sum("cum_sum", "Input0", getCumSumAxis(axis, size), exclusive, reverse));
network network(engine, topology);
network.set_input_data("Input0", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "cum_sum");
auto output = outputs.at("cum_sum").get_memory();
cldnn::mem_lock<output_type> output_ptr(output, get_test_stream());
auto answers = cumsum<output_type>(inputVals, in_out_format, { b, f, w, z, y, x }, axis, exclusive, reverse);
ASSERT_EQ(output_ptr.size(), answers.size());
for (size_t i = 0; i < answers.size(); ++i) {
EXPECT_TRUE(are_equal(answers[i], output_ptr[i])) << i;
}
}
};
class cum_sum_gpu_fp16 : public ::cum_sum_gpu<cum_sum_test_params, FLOAT16, FLOAT16> {};
class cum_sum_gpu_fp32 : public ::cum_sum_gpu<cum_sum_test_params, float, float> {};
class cum_sum_gpu_int32 : public ::cum_sum_gpu<cum_sum_test_params, int32_t, int32_t> {};
class cum_sum_gpu_int64 : public ::cum_sum_gpu<cum_sum_test_params, int64_t, int64_t> {};
TEST_P(cum_sum_gpu_fp16, basic) { auto p = GetParam(); execute(p); }
TEST_P(cum_sum_gpu_fp32, basic) { auto p = GetParam(); execute(p); }
TEST_P(cum_sum_gpu_int32, basic) { auto p = GetParam(); execute(p); }
TEST_P(cum_sum_gpu_int64, basic) { auto p = GetParam(); execute(p); }
namespace {
std::vector<std::vector<int>> axes = {
{0},
{0, 1},
{0, 1, 2},
{0, 1, 2, 3},
{0, 1, 2, 3, 4},
{0, 1, 2, 3, 4, 5},
{ 0 },
{ 0, 1 },
{ 0, 1, 2 },
{ 0, 1, 2, 3 },
{ 0, 1, 2, 3, 4 },
{ 0, 1, 2, 3, 4, 5 },
};
std::vector<bool> variants = {false, true};
std::vector<bool> variants = { false, true };
}
INSTANTIATE_TEST_SUITE_P(
axis_0,
cum_sum_gpu,
::testing::Combine(
::testing::Values(5),
::testing::Values(1),
::testing::Values(1),
::testing::Values(1),
::testing::Values(1),
::testing::Values(1),
::testing::Values(format::bfyx),
::testing::ValuesIn(axes[0]),
::testing::ValuesIn(variants),
::testing::ValuesIn(variants)
));
INSTANTIATE_TEST_SUITE_P(
axis_1,
cum_sum_gpu,
::testing::Combine(
::testing::Values(2),
::testing::Values(5),
::testing::Values(1),
::testing::Values(1),
::testing::Values(1),
::testing::Values(1),
::testing::Values(format::bfyx),
::testing::ValuesIn(axes[1]),
::testing::ValuesIn(variants),
::testing::ValuesIn(variants)
));
INSTANTIATE_TEST_SUITE_P(axis_0, cum_sum_gpu_fp16, ::testing::Combine(CASE_CUM_SUM_AXIS_0));
INSTANTIATE_TEST_SUITE_P(axis_0, cum_sum_gpu_fp32, ::testing::Combine(CASE_CUM_SUM_AXIS_0));
INSTANTIATE_TEST_SUITE_P(axis_0, cum_sum_gpu_int32, ::testing::Combine(CASE_CUM_SUM_AXIS_0));
INSTANTIATE_TEST_SUITE_P(axis_0, cum_sum_gpu_int64, ::testing::Combine(CASE_CUM_SUM_AXIS_0));
INSTANTIATE_TEST_SUITE_P(
axis_2,
cum_sum_gpu,
::testing::Combine(
::testing::Values(5),
::testing::Values(5),
::testing::Values(1),
::testing::Values(1),
::testing::Values(5),
::testing::Values(1),
::testing::Values(format::bfyx),
::testing::ValuesIn(axes[2]),
::testing::ValuesIn(variants),
::testing::ValuesIn(variants)
));
INSTANTIATE_TEST_SUITE_P(axis_1, cum_sum_gpu_fp16, ::testing::Combine(CASE_CUM_SUM_AXIS_1));
INSTANTIATE_TEST_SUITE_P(axis_1, cum_sum_gpu_fp32, ::testing::Combine(CASE_CUM_SUM_AXIS_1));
INSTANTIATE_TEST_SUITE_P(axis_1, cum_sum_gpu_int32, ::testing::Combine(CASE_CUM_SUM_AXIS_1));
INSTANTIATE_TEST_SUITE_P(axis_1, cum_sum_gpu_int64, ::testing::Combine(CASE_CUM_SUM_AXIS_1));
INSTANTIATE_TEST_SUITE_P(
axis_3,
cum_sum_gpu,
::testing::Combine(
::testing::Values(5),
::testing::Values(5),
::testing::Values(1),
::testing::Values(1),
::testing::Values(5),
::testing::Values(5),
::testing::Values(format::bfyx),
::testing::ValuesIn(axes[3]),
::testing::ValuesIn(variants),
::testing::ValuesIn(variants)
));
INSTANTIATE_TEST_SUITE_P(axis_2, cum_sum_gpu_fp16, ::testing::Combine(CASE_CUM_SUM_AXIS_2));
INSTANTIATE_TEST_SUITE_P(axis_2, cum_sum_gpu_fp32, ::testing::Combine(CASE_CUM_SUM_AXIS_2));
INSTANTIATE_TEST_SUITE_P(axis_2, cum_sum_gpu_int32, ::testing::Combine(CASE_CUM_SUM_AXIS_2));
INSTANTIATE_TEST_SUITE_P(axis_2, cum_sum_gpu_int64, ::testing::Combine(CASE_CUM_SUM_AXIS_2));
INSTANTIATE_TEST_SUITE_P(
axis_4,
cum_sum_gpu,
::testing::Combine(
::testing::Values(5),
::testing::Values(5),
::testing::Values(1),
::testing::Values(5),
::testing::Values(5),
::testing::Values(5),
::testing::Values(format::bfzyx),
::testing::ValuesIn(axes[4]),
::testing::ValuesIn(variants),
::testing::ValuesIn(variants)
));
INSTANTIATE_TEST_SUITE_P(axis_3, cum_sum_gpu_fp16, ::testing::Combine(CASE_CUM_SUM_AXIS_3));
INSTANTIATE_TEST_SUITE_P(axis_3, cum_sum_gpu_fp32, ::testing::Combine(CASE_CUM_SUM_AXIS_3));
INSTANTIATE_TEST_SUITE_P(axis_3, cum_sum_gpu_int32, ::testing::Combine(CASE_CUM_SUM_AXIS_3));
INSTANTIATE_TEST_SUITE_P(axis_3, cum_sum_gpu_int64, ::testing::Combine(CASE_CUM_SUM_AXIS_3));
INSTANTIATE_TEST_SUITE_P(
axis_5,
cum_sum_gpu,
::testing::Combine(
::testing::Values(5),
::testing::Values(5),
::testing::Values(5),
::testing::Values(5),
::testing::Values(5),
::testing::Values(5),
::testing::Values(format::bfwzyx),
::testing::ValuesIn(axes[5]),
::testing::ValuesIn(variants),
::testing::ValuesIn(variants)
));
INSTANTIATE_TEST_SUITE_P(axis_4, cum_sum_gpu_fp16, ::testing::Combine(CASE_CUM_SUM_AXIS_4));
INSTANTIATE_TEST_SUITE_P(axis_4, cum_sum_gpu_fp32, ::testing::Combine(CASE_CUM_SUM_AXIS_4));
INSTANTIATE_TEST_SUITE_P(axis_4, cum_sum_gpu_int32, ::testing::Combine(CASE_CUM_SUM_AXIS_4));
INSTANTIATE_TEST_SUITE_P(axis_4, cum_sum_gpu_int64, ::testing::Combine(CASE_CUM_SUM_AXIS_4));
INSTANTIATE_TEST_SUITE_P(axis_5, cum_sum_gpu_fp16, ::testing::Combine(CASE_CUM_SUM_AXIS_5));
INSTANTIATE_TEST_SUITE_P(axis_5, cum_sum_gpu_fp32, ::testing::Combine(CASE_CUM_SUM_AXIS_5));
INSTANTIATE_TEST_SUITE_P(axis_5, cum_sum_gpu_int32, ::testing::Combine(CASE_CUM_SUM_AXIS_5));
INSTANTIATE_TEST_SUITE_P(axis_5, cum_sum_gpu_int64, ::testing::Combine(CASE_CUM_SUM_AXIS_5));
// FIXME: This test fails on some driver versions. Looks like UB in impl or driver issue
TEST(cum_sum_gpu_f16, DISABLED_basic_1d) {