[GPU] Onednn integration for reorder primitive (#7687)

2021-09-28 17:10:35 +09:00 · 2021-09-28 17:10:35 +09:00 · acc14c6469
commit acc14c6469
parent 204c17cc21
13 changed files with 337 additions and 14 deletions
--- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp
@ -233,7 +233,12 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr<IInfe
        bool enable_profiling = (m_config.useProfiling ||
                (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_tune_and_cache) ||
                (m_config.tuningConfig.mode == cldnn::tuning_mode::tuning_retune_and_cache));
-        cldnn::queue_types queue_type = cldnn::queue_types::out_of_order;
+        cldnn::queue_types queue_type;
        if (dev->get_info().supports_immad)
            queue_type = cldnn::queue_types::in_order;
        else
            queue_type = cldnn::queue_types::out_of_order;
        bool use_unified_shared_memory = true;
        m_engine = cldnn::engine::create(engine_type, runtime_type, dev, cldnn::engine_configuration(enable_profiling,
                                                                                                     queue_type,
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/concatenation_onednn.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/concatenation_onednn.cpp
@ -0,0 +1,29 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "concatenation_inst.h"
 #include "eltwise_inst.h"
 #include "quantize_inst.h"
 #include "primitive_onednn_base.h"
 #include "impls/implementation_map.hpp"
 #include "kernel_selector_common.h"
 #include <oneapi/dnnl/dnnl.hpp>
 #include <algorithm>
 #include <memory>
 namespace cldnn {
 namespace onednn {
 namespace detail {
 // Empty constructor: no oneDNN concatenation implementation is registered here.
 attach_concatenation_onednn::attach_concatenation_onednn() {}
 }  // namespace detail
 }  // namespace onednn
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/eltwise_onednn.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/eltwise_onednn.cpp
@ -0,0 +1,25 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "eltwise_inst.h"
 #include "primitive_onednn_base.h"
 #include "impls/implementation_map.hpp"
 #include "kernel_selector_common.h"
 #include <oneapi/dnnl/dnnl.hpp>
 #include <algorithm>
 #include <memory>
 namespace cldnn {
 namespace onednn {
 namespace detail {
 // Empty constructor: no oneDNN eltwise implementation is registered here.
 attach_eltwise_onednn::attach_eltwise_onednn() {}
 }  // namespace detail
 }  // namespace onednn
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/fully_connected_onednn.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/fully_connected_onednn.cpp
@ -0,0 +1,25 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "fully_connected_inst.h"
 #include "primitive_onednn_base.h"
 #include "impls/implementation_map.hpp"
 #include "kernel_selector_common.h"
 #include <oneapi/dnnl/dnnl.hpp>
 #include <algorithm>
 #include <memory>
 namespace cldnn {
 namespace onednn {
 namespace detail {
 // Empty constructor: no oneDNN fully_connected implementation is registered here.
 attach_fully_connected_onednn::attach_fully_connected_onednn() {}
 }  // namespace detail
 }  // namespace onednn
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/gemm_onednn.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/gemm_onednn.cpp
@ -0,0 +1,25 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "gemm_inst.h"
 #include "primitive_onednn_base.h"
 #include "impls/implementation_map.hpp"
 #include "kernel_selector_common.h"
 #include <oneapi/dnnl/dnnl.hpp>
 #include <algorithm>
 #include <memory>
 namespace cldnn {
 namespace onednn {
 namespace detail {
 // Empty constructor: no oneDNN gemm implementation is registered here.
 attach_gemm_onednn::attach_gemm_onednn() {}
 }  // namespace detail
 }  // namespace onednn
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/pooling_onednn.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/pooling_onednn.cpp
@ -0,0 +1,26 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "pooling_inst.h"
 #include "primitive_onednn_base.h"
 #include "impls/implementation_map.hpp"
 #include "kernel_selector_common.h"
 #include <oneapi/dnnl/dnnl.hpp>
 #include <algorithm>
 #include <memory>
 namespace cldnn {
 namespace onednn {
 namespace detail {
 // Empty constructor: no oneDNN pooling implementation is registered here.
 attach_pooling_onednn::attach_pooling_onednn() {}
 }  // namespace detail
 }  // namespace onednn
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/register.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/register.cpp
@ -12,7 +12,12 @@ namespace onednn {
 // Invokes REGISTER_ONEDNN_IMPL once for every primitive type listed below.
 // NOTE(review): the macro body is not visible here; presumably each entry
 // instantiates the matching attach_<primitive>_onednn object - confirm
 // against the macro definition.
 void register_implementations() {
    REGISTER_ONEDNN_IMPL(convolution);
-}
+    REGISTER_ONEDNN_IMPL(concatenation);
    REGISTER_ONEDNN_IMPL(eltwise);
    REGISTER_ONEDNN_IMPL(gemm);
    REGISTER_ONEDNN_IMPL(pooling);
    REGISTER_ONEDNN_IMPL(reorder);
    REGISTER_ONEDNN_IMPL(fully_connected);}
 }  // namespace onednn
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/register.hpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/register.hpp
@ -18,6 +18,12 @@ namespace detail {
    }
 // One REGISTER_ONEDNN_IMPL entry per primitive type.
 // NOTE(review): presumably each entry declares the attach_<primitive>_onednn
 // type whose constructor is defined in <primitive>_onednn.cpp - confirm
 // against the macro definition above.
 REGISTER_ONEDNN_IMPL(convolution);
 REGISTER_ONEDNN_IMPL(concatenation);
 REGISTER_ONEDNN_IMPL(eltwise);
 REGISTER_ONEDNN_IMPL(gemm);
 REGISTER_ONEDNN_IMPL(pooling);
 REGISTER_ONEDNN_IMPL(reorder);
 REGISTER_ONEDNN_IMPL(fully_connected);
 #undef REGISTER_ONEDNN_IMPL
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/reorder_onednn.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/reorder_onednn.cpp
@ -0,0 +1,80 @@
 // Copyright (C) 2021 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "reorder_inst.h"
 #include "primitive_onednn_base.h"
 #include "impls/implementation_map.hpp"
 #include "kernel_selector_common.h"
 #include <oneapi/dnnl/dnnl.hpp>
 #include <algorithm>
 #include <memory>
 namespace cldnn {
 namespace onednn {
 // oneDNN-backed implementation of the clDNN reorder primitive.
 //
 // Holds a dnnl::reorder primitive descriptor built from the node's input and
 // output layouts and maps the instance's memories onto oneDNN argument ids.
 struct reorder_onednn : typed_primitive_onednn_impl<reorder, void, dnnl::reorder::primitive_desc, dnnl::reorder> {
    using parent = typed_primitive_onednn_impl<reorder, void, dnnl::reorder::primitive_desc, dnnl::reorder>;
    using parent::parent;
 protected:
    // Returns a copy of this implementation object.
    std::unique_ptr<primitive_impl> clone() const override {
        return make_unique<reorder_onednn>(*this);
    }
    // Builds the oneDNN argument map (argument id -> dnnl::memory) for `instance`.
    //
    // Every input memory is bound to consecutive argument ids starting at
    // DNNL_ARG_FROM, each wrapped with the source descriptor of _pd. The output
    // memory is bound to DNNL_ARG_TO with the destination descriptor of _pd.
    std::unordered_map<int, dnnl::memory> get_arguments(reorder_inst& instance) const override {
        std::unordered_map<int, dnnl::memory> args;
        int input_idx = DNNL_ARG_FROM;
        for (size_t i = 0; i < instance.inputs_memory_count(); i++) {
            auto& input = instance.input_memory(i);
            args.insert({input_idx++, input.get_onednn_memory(_pd.src_desc())});
        }
        auto& output = instance.output_memory();
        args.insert({DNNL_ARG_TO, output.get_onednn_memory(_pd.dst_desc())});
        return args;
    }
    // Creates the reorder primitive descriptor for `arg`.
    //
    // The source descriptor comes from the output layout of dependency 0 and the
    // destination descriptor from the node's own output layout. Both sides use
    // the oneDNN engine of the program's engine, and the attributes are taken
    // from get_primitive_attributes(arg).
    static std::shared_ptr<dnnl::reorder::primitive_desc> get_reorder_descriptor(const reorder_node& arg) {
        auto& input = arg.get_dependency(0);
        auto& engine = arg.get_program().get_engine();
        auto input_md = onednn::layout_to_memory_desc(input.get_output_layout());
        auto output_md = onednn::layout_to_memory_desc(arg.get_output_layout());
        return std::make_shared<dnnl::reorder::primitive_desc>(
            engine.get_onednn_engine(),
            input_md,
            engine.get_onednn_engine(),
            output_md,
            *get_primitive_attributes(arg));
    }
 public:
    // Factory for the oneDNN reorder implementation of `arg`.
    //
    // Returns a heap-allocated reorder_onednn as a raw pointer.
    // NOTE(review): ownership presumably passes to the caller - confirm against
    // implementation_map's factory contract.
    static primitive_impl* create(const reorder_node& arg) {
        auto desc = get_reorder_descriptor(arg);
        auto attr = get_primitive_attributes(arg);
        // A null shared_ptr<void> is passed as the second constructor argument;
        // presumably reorder has no separate operation descriptor (the
        // template's second parameter is void) - confirm against the base class.
        std::shared_ptr<void> dummy = nullptr;
        return new reorder_onednn(arg, dummy, attr, *desc);
    }
 };
 namespace detail {
 // Adds reorder_onednn::create to the reorder implementation map under
 // impl_types::onednn, with an empty third argument.
 attach_reorder_onednn::attach_reorder_onednn() {
    const auto impl_kind = impl_types::onednn;
    implementation_map<reorder>::add(impl_kind, reorder_onednn::create, {});
 }
 }  // namespace detail
 }  // namespace onednn
 }  // namespace cldnn
--- a/inference-engine/thirdparty/clDNN/src/impls/onednn/utils.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/onednn/utils.cpp
@ -83,16 +83,16 @@ dnnl::memory::format_tag convert_data_format(cldnn::format fmt) {
        case cldnn::format::bfyx: return dnnl::memory::format_tag::nchw;
        case cldnn::format::bfzyx: return dnnl::memory::format_tag::ncdhw;
        case cldnn::format::byxf: return dnnl::memory::format_tag::nhwc;
-        // case cldnn::format::b_fs_yx_fsv16: return dnnl::memory::format_tag::nChw16c;
+        case cldnn::format::b_fs_yx_fsv16: return dnnl::memory::format_tag::nChw16c;
-        // case cldnn::format::b_fs_yx_fsv32: return dnnl::memory::format_tag::aBcd32b;
+        case cldnn::format::b_fs_yx_fsv32: return dnnl::memory::format_tag::aBcd32b;
-        // case cldnn::format::b_fs_zyx_fsv16: return dnnl::memory::format_tag::nCdhw16c;
+        case cldnn::format::b_fs_zyx_fsv16: return dnnl::memory::format_tag::nCdhw16c;
-        // case cldnn::format::b_fs_zyx_fsv32: return dnnl::memory::format_tag::aBcde32b;
+        case cldnn::format::b_fs_zyx_fsv32: return dnnl::memory::format_tag::aBcde32b;
-        // case cldnn::format::bs_fs_yx_bsv16_fsv16: return dnnl::memory::format_tag::NChw16n16c;
+        case cldnn::format::bs_fs_yx_bsv16_fsv16: return dnnl::memory::format_tag::NChw16n16c;
-        // case cldnn::format::bs_fs_yx_bsv32_fsv32: return dnnl::memory::format_tag::NChw32n32c;
+        case cldnn::format::bs_fs_yx_bsv32_fsv32: return dnnl::memory::format_tag::NChw32n32c;
-        // case cldnn::format::bs_fs_yx_bsv4_fsv4: return dnnl::memory::format_tag::ABcd4a4b;
+        case cldnn::format::bs_fs_yx_bsv4_fsv4: return dnnl::memory::format_tag::ABcd4a4b;
-        // case cldnn::format::bs_fs_yx_bsv4_fsv2: return dnnl::memory::format_tag::ABcd4a2b;
+        case cldnn::format::bs_fs_yx_bsv4_fsv2: return dnnl::memory::format_tag::ABcd4a2b;
-        // case cldnn::format::bs_fs_yx_bsv32_fsv16: return dnnl::memory::format_tag::NChw32n16c;
+        case cldnn::format::bs_fs_yx_bsv32_fsv16: return dnnl::memory::format_tag::NChw32n16c;
-        // case cldnn::format::bs_fs_zyx_bsv16_fsv16: return dnnl::memory::format_tag::NCdhw16n16c;
+        case cldnn::format::bs_fs_zyx_bsv16_fsv16: return dnnl::memory::format_tag::NCdhw16n16c;
        default: throw std::invalid_argument("[clDNN] Unsupported conversion from cldnn to ondnn layout " + fmt_to_str(fmt));
    }
 }
--- a/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
+++ b/inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
@ -842,6 +842,48 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node) {
        const size_t kNStreams = static_cast<size_t>(node.get_program().get_engine().configuration().n_streams);
        const size_t kKeyValue = kBatchNum * std::min(kClassNum, static_cast<size_t>(8)) * kNStreams;
        preferred_impl = (kKeyValue > 64) ? impl_types::ocl : impl_types::cpu;
    } else if (node.is_type<reorder>()) {
        if (!node.get_program().get_engine().get_device_info().supports_immad)
            return impl_types::ocl;
        std::vector<format> onednn_optimized_fmt = {
            format::bfyx,
            format::b_fs_zyx_fsv16,
            format::b_fs_yx_fsv16,
            format::b_fs_yx_fsv32,
            format::bs_fs_yx_bsv16_fsv16,
            format::bs_fs_zyx_bsv16_fsv16,
            format::bs_fs_yx_bsv32_fsv16,
            format::bs_fs_yx_bsv32_fsv32,
        };
        auto input_layout = node.get_dependency(0).get_output_layout();
        auto output_layout = node.get_output_layout();
        auto input_fmt = input_layout.format;
        auto output_fmt = output_layout.format;
        preferred_impl = impl_types::onednn;
        if (std::find(onednn_optimized_fmt.begin(), onednn_optimized_fmt.end(), input_fmt) == onednn_optimized_fmt.end() ||
            std::find(onednn_optimized_fmt.begin(), onednn_optimized_fmt.end(), output_fmt) == onednn_optimized_fmt.end()) {
            preferred_impl = impl_types::ocl;
        }
        // onednn doesn't support paddings
        if (input_layout.data_padding || output_layout.data_padding) {
            preferred_impl = impl_types::ocl;
        }
        // Native impl works faster for this type of reorder
        if (input_layout.format == format::bfyx && output_layout.format == format::bfyx) {
            preferred_impl = impl_types::ocl;
        }
        // onednn reorder doesn't support different number of dimensions in input and output layouts
        if (input_layout.format.dimension() != output_layout.format.dimension()) {
            preferred_impl = impl_types::ocl;
        }
    }
    return preferred_impl;
--- a/inference-engine/thirdparty/clDNN/src/program_dump_graph.cpp
+++ b/inference-engine/thirdparty/clDNN/src/program_dump_graph.cpp
@ -211,8 +211,9 @@ void dump_graph_init(std::ofstream& graph,
            !node->can_be_optimized()) {
            graph << "\\n Selected kernel: "
                  << (node->get_selected_impl() == nullptr ? "none"
-                                                           : node->get_selected_impl()->get_kernel_name()) +
+                                                           : node->get_selected_impl()->get_kernel_name()) + " / "
-                         "\n" + dump_mem_info(node);
+                  << node->get_preferred_impl_type()
                  << "\n" + dump_mem_info(node);
        }
        graph << "\"";
 #ifdef __clang__
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
@ -2394,3 +2394,57 @@ INSTANTIATE_TEST_SUITE_P(DISABLED_REORDER,
                        reorder_test,
                        ::testing::ValuesIn(reorder_test::generate_specific_test_params()),
                        tests::generic_test::custom_param_name_functor());
 #ifdef ENABLE_ONEDNN_FOR_GPU
 // Round trip f32 -> int8 -> f32 over a 1x1x3x3 tensor, with the first reorder
 // forced onto the oneDNN implementation.
 TEST(reorder_onednn_gpu, basic_convert_int8) {
    auto& engine = get_onednn_test_engine();
    layout in_layout = { type_to_data_type<float>::value, format::byxf, { 1, 1, 3, 3 } };
    layout byte_layout = { type_to_data_type<int8_t>::value, format::bfyx, { 1, 1, 3, 3 } };
    std::initializer_list<float> input_f = { 1.0f, -2.6f, 3.1f, -4.0f, 5.03f, -6.99f, 7.0f, -8.0f, 9.0f };
    // Values expected in the "reorder2" output after the int8 conversion and back.
    std::list<float> final_results = { 1.0f, -3.0f, 3.0f, -4.0f, 5.0f, -7.0f, 7.0f, -8.0f, 9.0f };

    // Input buffer holding the float source values.
    auto input_memory = engine.allocate_memory(in_layout);
    set_values(input_memory, input_f);

    // Topology: "input" -> "reorder_input" (to byte_layout) -> "reorder2" (back to in_layout).
    input_layout input("input", in_layout);
    reorder to_int8("reorder_input", input, byte_layout);
    reorder back_to_float("reorder2", "reorder_input", in_layout);
    topology topo(input, to_int8, back_to_float);

    // Expose both reorders as outputs and force oneDNN for the first one only.
    build_options build_opts;
    build_opts.set_option(build_option::outputs({ "reorder_input", "reorder2"}));
    implementation_desc impl = { format::bfyx, std::string(""), impl_types::onednn };
    build_opts.set_option(build_option::force_implementations({{ "reorder_input", impl }}));

    network net(engine, topo, build_opts);
    net.set_input_data("input", input_memory);
    auto outputs = net.execute();

    // Compare the second reorder's output element by element.
    auto result_mem = outputs.at("reorder2").get_memory();
    cldnn::mem_lock<float> result_ptr(result_mem, get_test_stream());
    size_t idx = 0;
    for (const float expected : final_results) {
        EXPECT_EQ(expected, result_ptr[idx]);
        ++idx;
    }
 }
 #endif