[GPU] Fix strided_slice for dynamic cases (#12979)

This commit is contained in:
Vladimir Paramuzov 2022-09-13 09:17:50 +04:00 committed by GitHub
parent 3c24ee6cda
commit a73fc2dce1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 165 additions and 54 deletions

View File

@ -14,6 +14,34 @@
using namespace cldnn;
namespace {
template <typename T, typename DT, typename = typename std::enable_if<std::is_convertible<DT, T>::value>::type>
std::vector<T>& pad_vector_to_size(std::vector<T>& data, size_t size, DT value) {
    // Grow `data` up to `size` elements by appending `value`; never shrinks.
    if (data.size() < size)
        data.resize(size, static_cast<T>(value));
    return data;
}
template <typename T, typename MT>
std::vector<T>& vector_assign_if_not_mask(std::vector<T>& dst, const T& src, const std::vector<MT>& mask) {
    // Overwrite every dst element whose mask entry is falsy with `src`.
    // NOTE(review): assumes mask.size() >= dst.size() — confirm at call sites.
    size_t idx = 0;
    for (auto& element : dst) {
        if (!mask[idx])
            element = src;
        ++idx;
    }
    return dst;
}
template <typename T, typename MT>
std::vector<T>& vector_assign_if_not_mask(std::vector<T>& dst, const std::vector<T>& src, const std::vector<MT>& mask) {
    // Element-wise: copy src[i] into dst[i] wherever mask[i] is falsy.
    // NOTE(review): assumes src and mask are at least dst.size() long — confirm at call sites.
    auto src_it = src.begin();
    auto mask_it = mask.begin();
    for (auto& element : dst) {
        if (!*mask_it)
            element = *src_it;
        ++src_it;
        ++mask_it;
    }
    return dst;
}
} // namespace
namespace cldnn {
namespace ocl {
@ -27,16 +55,16 @@ struct strided_slice_impl : typed_primitive_impl_ocl<strided_slice> {
public:
static primitive_impl* create(const strided_slice_node& arg, const kernel_impl_params& impl_param) {
const auto& prim = arg.get_primitive();
const auto& prim = impl_param.typed_desc<strided_slice>();
auto params = get_default_params<kernel_selector::strided_slice_params>(impl_param);
auto op_params = get_default_optional_params<kernel_selector::strided_slice_optional_params>(arg.get_program());
const size_t dims_num = params.inputs[0].Dimentions();
// Getting data from constant inputs. There are 3 args: Begin, End, Stride
for (size_t i = 1; i < arg.get_dependencies().size(); ++i) {
auto& input = arg.get_dependency(i).as<data>();
auto mem = input.get_attached_memory_ptr();
std::vector<int32_t> sizes = read_vector<int32_t>(mem, arg.get_program().get_stream());
OPENVINO_ASSERT(impl_param.memory_deps.count(i) > 0, "[GPU] Can't find StridedSlice memory dependency");
auto mem = impl_param.memory_deps.at(i);
std::vector<int32_t> sizes = read_vector<int32_t>(mem, impl_param.prog.get_stream());
pad_vector_to_size(sizes, dims_num, i != 1); // for "begin" completion used 0 value, for other - 1
params.striding_params.push_back(sizes);
}

View File

@ -14,53 +14,6 @@
namespace cldnn {
template <typename T, typename DT, typename = typename std::enable_if<std::is_convertible<DT, T>::value>::type>
std::vector<T>& pad_vector_to_size(std::vector<T>& data, size_t size, DT value) {
    // Pad `data` with copies of `value` until it holds `size` elements;
    // a vector already at or beyond `size` is returned untouched.
    while (data.size() < size)
        data.push_back(static_cast<T>(value));
    return data;
}
template <typename T, typename MT>
std::vector<T>& vector_assign_if_not_mask(std::vector<T>& dst, const T& src, const std::vector<MT>& mask) {
    // Broadcast-assign `src` into the positions of dst whose mask entry is falsy.
    // NOTE(review): mask is indexed over dst's length — confirm mask.size() >= dst.size().
    const size_t count = dst.size();
    for (size_t pos = 0; pos != count; ++pos) {
        if (!mask[pos]) {
            dst[pos] = src;
        }
    }
    return dst;
}
template <typename T, typename MT>
std::vector<T>& vector_assign_if_not_mask(std::vector<T>& dst, const std::vector<T>& src, const std::vector<MT>& mask) {
    // Merge src into dst element-wise, keeping dst[i] where mask[i] is truthy.
    // NOTE(review): src and mask must cover dst's length — confirm at call sites.
    const size_t count = dst.size();
    for (size_t pos = 0; pos != count; ++pos) {
        if (!mask[pos]) {
            dst[pos] = src[pos];
        }
    }
    return dst;
}
// Map a tensor rank to the plain (non-blocked) cldnn format able to hold it:
// ranks 1..4 -> bfyx, 5 -> bfzyx, 6 -> bfwzyx.
// Any other rank reports an error; the bfyx default is still returned in
// case the error macro does not terminate execution.
inline format get_default_format_for_dim(size_t dimension) {
    format result = format::bfyx;
    if (dimension == 5) {
        result = format::bfzyx;
    } else if (dimension == 6) {
        result = format::bfwzyx;
    } else if (dimension < 1 || dimension > 4) {
        CLDNN_ERROR_MESSAGE("Function get_default_format_for_dim", "Unsupported dimension number: " + std::to_string(dimension));
    }
    return result;
}
template <>
struct typed_program_node<strided_slice> : public typed_program_node_base<strided_slice> {
using parent = typed_program_node_base<strided_slice>;
@ -69,6 +22,7 @@ public:
using parent::parent;
program_node& input(size_t index = 0) const { return get_dependency(index); }
std::vector<size_t> get_shape_infer_dependencies() const override { return {1, 2, 3}; }
};
using strided_slice_node = typed_program_node<strided_slice>;

View File

@ -12,6 +12,7 @@
#include "convolution_inst.h"
#include "deconvolution_inst.h"
#include "shape_of_inst.h"
#include "strided_slice_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "intel_gpu/graph/network.hpp"
@ -156,7 +157,19 @@ void primitive_inst::update_shape() {
if (_node.is_type<shape_of>())
return;
if (!input_shape_changed && !_node.generates_dynamic_output() && _impl_params->output_layout.is_static())
// Strided slice loads data from {1,2,3} dependencies in impl::create method.
// It means that this data must be put into impl_params map
// Thus we treat it as "dynamic" case
// TODO: Remove once strided slice impl support runtime tensors for begin/end/stride
bool strided_slice_wa = false;
if (_node.is_type<strided_slice>()) {
for (size_t i = 1; i < _node.get_dependencies().size(); i++) {
if (!_node.get_dependency(i).is_type<data>())
strided_slice_wa = true;
}
}
if (!strided_slice_wa && !input_shape_changed && !_node.generates_dynamic_output() && _impl_params->output_layout.is_static())
return;
auto memory_deps = _node.get_const_memory_deps();

View File

@ -6,6 +6,7 @@
#include "program_helpers.h"
#include "primitive_inst.h"
#include "loop_inst.h"
#include "strided_slice_inst.h"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "convolution_inst.h"
@ -281,6 +282,16 @@ bool program_node::recalc_output_layout(bool invalidate_users_if_changed) {
}
bool program_node::is_dynamic() const {
// Strided slice loads data from {1,2,3} dependencies in impl::create method.
// It means that this data must be put into impl_params map
// Thus we treat it as "dynamic" case
// TODO: Remove once strided slice impl support runtime tensors for begin/end/stride
if (is_type<strided_slice>()) {
for (size_t i = 1; i < get_dependencies().size(); i++) {
if (!get_dependency(i).is_type<data>())
return true;
}
}
for (const auto* input : get_dependencies()) {
if (input->get_output_layout().is_dynamic())
return true;
@ -290,6 +301,17 @@ bool program_node::is_dynamic() const {
}
bool program_node::is_dynamic() {
// Strided slice loads data from {1,2,3} dependencies in impl::create method.
// It means that this data must be put into impl_params map
// Thus we treat it as "dynamic" case
// TODO: Remove once strided slice impl support runtime tensors for begin/end/stride
if (is_type<strided_slice>()) {
for (size_t i = 1; i < get_dependencies().size(); i++) {
if (!get_dependency(i).is_type<data>())
return true;
}
}
for (auto& input : get_dependencies()) {
if (input->get_output_layout(true).is_dynamic())
return true;

View File

@ -39,8 +39,8 @@ std::vector<layout> strided_slice_inst::calc_output_layouts(strided_slice_node c
auto& constant_mem = impl_param.memory_deps;
if (constant_mem.empty()) {
auto out_shape = ov::PartialShape::dynamic(input0_layout.get_rank());
if (!constant_mem.count(1) || !constant_mem.count(2) || !constant_mem.count(3)) {
auto out_shape = ov::PartialShape::dynamic(input0_layout.get_partial_shape().size());
return { layout{out_shape, input0_layout.data_type, format::get_default_format(out_shape.rank().get_length())} };
}

View File

@ -1459,3 +1459,97 @@ TEST(strided_slice_gpu_f32_i64, test_2x2x2x1x1_2_negative_all) {
EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
}
}
TEST(strided_slice_gpu_f32_i64, test_2x2x2x1x1_2_negative_all_dynamic) {
    // Dynamic-shape case: the input layout is declared with a fully dynamic
    // rank-3 shape, while the actual buffer is 2x2x2.
    // Slice spec begin={0,0,0}, end={2,2,2}, strides={1,2,2} keeps
    // elements at flat offsets 0 and 4.
    auto& engine = get_test_engine();

    auto input_lay = layout{ ov::PartialShape::dynamic(3), data_types::f32, format::bfyx };
    auto input = engine.allocate_memory({ ov::PartialShape{ 2, 2, 2 }, data_types::f32, format::bfyx, });
    auto begin = engine.allocate_memory({ ov::PartialShape{ 3 }, data_types::i64, format::bfyx });
    auto end = engine.allocate_memory({ ov::PartialShape{ 3 }, data_types::i64, format::bfyx });
    auto strides = engine.allocate_memory({ ov::PartialShape{ 3 }, data_types::i64, format::bfyx });

    set_values(input, {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f});
    set_values<int64_t>(begin, {0, 0, 0});
    set_values<int64_t>(end, {2, 2, 2});
    set_values<int64_t>(strides, {1, 2, 2});

    topology tpl;
    tpl.add(input_layout("input", input_lay));
    tpl.add(data("input2", begin));
    tpl.add(data("input3", end));
    tpl.add(data("input4", strides));
    tpl.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {}, {}));

    build_options opts;
    opts.set_option(build_option::allow_new_shape_infer(true));

    network net(engine, tpl, opts);
    net.set_input_data("input", input);
    auto outputs = net.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "strided_slice");

    auto output = outputs.at("strided_slice").get_memory();
    const std::vector<float> expected = { 0.0f, 4.0f };

    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
    for (size_t idx = 0; idx < expected.size(); ++idx) {
        EXPECT_TRUE(are_equal(expected[idx], output_ptr[idx]));
    }
}
TEST(strided_slice_gpu_f32_i64, test_2x2x2x1x1_2_negative_all_dynamic_begin) {
    // Runtime-tensor case: "begin" is fed through input_layout at execution
    // time instead of being a constant, exercising the non-constant
    // dependency path of strided_slice. Expected result matches the
    // constant-input variant: elements 0 and 4.
    auto& engine = get_test_engine();

    auto input = engine.allocate_memory({ ov::PartialShape{ 2, 2, 2 }, data_types::f32, format::bfyx, });
    auto begin = engine.allocate_memory({ ov::PartialShape{ 3 }, data_types::i64, format::bfyx });
    auto end = engine.allocate_memory({ ov::PartialShape{ 3 }, data_types::i64, format::bfyx });
    auto strides = engine.allocate_memory({ ov::PartialShape{ 3 }, data_types::i64, format::bfyx });

    set_values(input, {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f});
    set_values<int64_t>(begin, {0, 0, 0});
    set_values<int64_t>(end, {2, 2, 2});
    set_values<int64_t>(strides, {1, 2, 2});

    topology tpl;
    tpl.add(data("input", input));
    tpl.add(input_layout("input2", begin->get_layout()));
    tpl.add(data("input3", end));
    tpl.add(data("input4", strides));
    tpl.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {}, {}));

    build_options opts;
    opts.set_option(build_option::allow_new_shape_infer(true));

    network net(engine, tpl, opts);
    net.set_input_data("input2", begin);
    auto outputs = net.execute();

    EXPECT_EQ(outputs.size(), size_t(1));
    EXPECT_EQ(outputs.begin()->first, "strided_slice");

    auto output = outputs.at("strided_slice").get_memory();
    const std::vector<float> expected = { 0.0f, 4.0f };

    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
    for (size_t idx = 0; idx < expected.size(); ++idx) {
        EXPECT_TRUE(are_equal(expected[idx], output_ptr[idx]));
    }
}