[GPU] Change priority of CPU implementations (#17829)

This commit is contained in:
Sergey Shlyapnikov 2023-06-05 11:21:26 +04:00 committed by GitHub
parent a9ddc2b553
commit db8d23231a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 31 deletions

View File

@@ -16,15 +16,15 @@
namespace cldnn {
template <typename T, typename U>
class singleton_map : public std::map<T, U> {
singleton_map() : std::map<T, U>() {}
singleton_map(singleton_map const&) = delete;
void operator=(singleton_map const&) = delete;
template <typename T>
class singleton_list : public std::vector<T> {
singleton_list() : std::vector<T>() {}
singleton_list(singleton_list const&) = delete;
void operator=(singleton_list const&) = delete;
public:
static singleton_map& instance() {
static singleton_map instance_;
static singleton_list& instance() {
static singleton_list instance_;
return instance_;
}
};
@@ -47,20 +47,20 @@ public:
using key_builder = implementation_key;
using key_type = typename key_builder::type;
using factory_type = std::function<std::unique_ptr<primitive_impl>(const typed_program_node<primitive_kind>&, const kernel_impl_params&)>;
using map_type = singleton_map<std::pair<impl_types, shape_types>, std::pair<std::set<key_type>, factory_type>>;
using list_type = singleton_list<std::tuple<impl_types, shape_types, std::set<key_type>, factory_type>>;
static factory_type get(const kernel_impl_params& impl_params, impl_types preferred_impl_type, shape_types target_shape_type) {
auto input_layout = !impl_params.input_layouts.empty() ? impl_params.input_layouts[0] : layout{ov::PartialShape{}, data_types::f32, format::any};
auto key = key_builder()(input_layout);
for (auto& kv : map_type::instance()) {
impl_types impl_type = kv.first.first;
shape_types supported_shape_type = kv.first.second;
for (auto& kv : list_type::instance()) {
impl_types impl_type = std::get<0>(kv);
shape_types supported_shape_type = std::get<1>(kv);
if ((preferred_impl_type & impl_type) != impl_type)
continue;
if ((target_shape_type & supported_shape_type) != target_shape_type)
continue;
std::set<key_type>& keys_set = kv.second.first;
auto& factory = kv.second.second;
std::set<key_type>& keys_set = std::get<2>(kv);
auto& factory = std::get<3>(kv);
if (keys_set.empty() || keys_set.find(key) != keys_set.end()) {
return factory;
}
@@ -85,14 +85,14 @@ public:
}
static bool check_key(impl_types target_impl_type, key_type key, shape_types target_shape_type) {
for (auto& kv : map_type::instance()) {
impl_types impl_type = kv.first.first;
shape_types supported_shape_type = kv.first.second;
for (auto& kv : list_type::instance()) {
impl_types impl_type = std::get<0>(kv);
shape_types supported_shape_type = std::get<1>(kv);
if ((target_impl_type & impl_type) != impl_type)
continue;
if ((target_shape_type & supported_shape_type) != target_shape_type)
continue;
std::set<key_type>& keys_set = kv.second.first;
std::set<key_type>& keys_set = std::get<2>(kv);
if (keys_set.empty())
return true;
return keys_set.find(key) != keys_set.end();
@@ -117,7 +117,7 @@ public:
static void add(impl_types impl_type, shape_types shape_type, factory_type factory, std::set<key_type> keys) {
OPENVINO_ASSERT(impl_type != impl_types::any, "[GPU] Can't register impl with type any");
map_type::instance().insert({{impl_type, shape_type}, {keys, factory}});
list_type::instance().push_back({impl_type, shape_type, keys, factory});
}
static std::set<key_type> combine(const std::vector<data_types>& types, const std::vector<format::type>& formats) {
@@ -133,22 +133,22 @@ public:
struct WeightsReordersFactory {
using factory_type = std::function<std::unique_ptr<primitive_impl>(const kernel_impl_params&)>;
using map_type = singleton_map<std::pair<impl_types, shape_types>, factory_type>;
using list_type = singleton_list<std::tuple<impl_types, shape_types, factory_type>>;
static void add(impl_types impl_type, shape_types shape_type, factory_type factory) {
OPENVINO_ASSERT(impl_type != impl_types::any, "[GPU] Can't register WeightsReordersFactory with type any");
map_type::instance().insert({{impl_type, shape_type}, factory});
list_type::instance().push_back({impl_type, shape_type, factory});
}
static factory_type get(impl_types preferred_impl_type, shape_types target_shape_type) {
for (auto& kv : map_type::instance()) {
impl_types impl_type = kv.first.first;
shape_types supported_shape_type = kv.first.second;
for (auto& kv : list_type::instance()) {
impl_types impl_type = std::get<0>(kv);
shape_types supported_shape_type = std::get<1>(kv);
if ((preferred_impl_type & impl_type) != impl_type)
continue;
if ((target_shape_type & supported_shape_type) != target_shape_type)
continue;
return kv.second;
return std::get<2>(kv);
}
OPENVINO_THROW("[GPU] WeightsReordersFactory doesn't have any implementation for "
" impl_type: ", preferred_impl_type, ", shape_type: ", target_shape_type);

View File

@@ -177,14 +177,18 @@ void program::init_program() {
}
void program::init_primitives() {
// Register implementations in order of their selection priority: common, OCL, oneDNN, CPU
// We register OCL implementation before oneDNN, because oneDNN is not always preferable (in case of iGPU)
// This order will only apply to primitives with preferable implementation type equal to impl_types::any
static bool is_initialized = false;
if (!is_initialized) {
common::register_implementations();
cpu::register_implementations();
ocl::register_implementations();
#ifdef ENABLE_ONEDNN_FOR_GPU
onednn::register_implementations();
#endif
cpu::register_implementations();
is_initialized = true;
}
}

View File

@@ -5,6 +5,7 @@
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/reorder.hpp>
#include <intel_gpu/primitives/detection_output.hpp>
using namespace cldnn;
@@ -429,8 +430,12 @@ public:
top_k, eta, code_type, variance_encoded_in_target, confidence_threshold, prior_info_size,
prior_coordinates_offset, prior_is_normalized, input_width, input_height, decrease_label_id
));
topology.add(reorder("output_reorder", input_info("detection_output"), format::bfyx, type_to_data_type<T>::value));
cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"detection_output", {format::bfyx, "", impl_types::cpu}}}));
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input_location", input_location);
network->set_input_data("input_confidence", input_confidence);
@@ -439,7 +444,7 @@ public:
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "detection_output");
ASSERT_EQ(outputs.begin()->first, "output_reorder");
ASSERT_EQ(outputs.begin()->second.get_memory()->get_layout().batch(), 1);
ASSERT_EQ(outputs.begin()->second.get_memory()->get_layout().feature(), 1);
@@ -685,8 +690,12 @@ public:
topology.add(reorder("input_confidence_padded", input_info("input_confidence"), input_location->get_layout().with_padding(padding{ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
topology.add(detection_output("detection_output", input_info("input_location_padded"), input_info("input_confidence_padded"), input_info("input_prior_box"), this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
topology.add(reorder("output_reorder", input_info("detection_output"), format::bfyx, type_to_data_type<T>::value));
cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"detection_output", {format::bfyx, "", impl_types::cpu}}}));
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input_location", input_location);
network->set_input_data("input_confidence", input_confidence);
@@ -695,7 +704,7 @@ public:
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "detection_output");
ASSERT_EQ(outputs.begin()->first, "output_reorder");
ASSERT_EQ(outputs.begin()->second.get_memory()->get_layout().batch(), 1);
ASSERT_EQ(outputs.begin()->second.get_memory()->get_layout().feature(), 1);
@@ -742,6 +751,7 @@ public:
topology.add(input_layout("input_prior_box", input_prior_box->get_layout()));
topology.add(reorder("input_location_padded", input_info("input_location"), input_location->get_layout().with_padding(padding{ { 0, 0, 12, 3 },{ 0, 0, 5, 11 } })));
topology.add(reorder("input_confidence_padded", input_info("input_confidence"), input_location->get_layout().with_padding(padding{ { 0, 0, 2, 7 },{ 0, 0, 13, 1 } })));
topology.add(reorder("output_reorder", input_info("detection_output"), format::bfyx, type_to_data_type<T>::value));
topology.add(detection_output("detection_output", input_info("input_location_padded"), input_info("input_confidence_padded"), input_info("input_prior_box"),
this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k,
@@ -749,7 +759,10 @@ public:
prior_is_normalized, this->img_size, this->img_size
));
cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"detection_output", {format::bfyx, "", impl_types::cpu}}}));
cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
network->set_input_data("input_location", input_location);
network->set_input_data("input_confidence", input_confidence);
@@ -758,7 +771,7 @@ public:
auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "detection_output");
ASSERT_EQ(outputs.begin()->first, "output_reorder");
ASSERT_EQ(outputs.begin()->second.get_memory()->get_layout().batch(), 1);
ASSERT_EQ(outputs.begin()->second.get_memory()->get_layout().feature(), 1);