From 0cf319f8552faf28a347800d35f6457257254e3f Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Mon, 21 Nov 2022 22:52:18 +0900 Subject: [PATCH] [GPU] Apply cached_blob to make engine (#13781) + Updated oneDNN to use cache_blob + Updated oneDNN to fix group conv failure + Add g_os_iyx_osv8 format and relevant reorder to support oneDNN update + Used cached_blob to make engine if cache_dir config is used Signed-off-by: Min, Byungil --- .../include/intel_gpu/runtime/format.hpp | 1 + .../src/graph/impls/onednn/utils.cpp | 1 + .../src/graph/kernel_selector_helper.cpp | 4 +++ .../cl_kernels/reorder_weights.cl | 4 +++ .../intel_gpu/src/kernel_selector/jitter.cpp | 4 ++- .../kernel_selector_common.cpp | 1 + .../kernels/reorder/reorder_kernel_base.cpp | 1 + .../src/kernel_selector/tensor_type.cpp | 5 +++ .../src/kernel_selector/tensor_type.h | 1 + src/plugins/intel_gpu/src/runtime/format.cpp | 7 ++-- .../intel_gpu/src/runtime/ocl/ocl_engine.cpp | 34 ++++++++++++++++++- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 12 files changed, 59 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp index 2f76e3d43c9..cde74d528a6 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp @@ -222,6 +222,7 @@ struct format { gyxio, ///< format used for weights for 2D convolution goizyx, ///< format used for weights for 3D convolution giozyx, ///< format used for weights for 3D deconvolution + g_os_iyx_osv8, ///< format used for weights for 2D convolution g_os_iyx_osv16, ///< format used for weights for 2D convolution g_os_iyx_osv32, ///< format used for weights for 2D convolution gs_oiyx_gsv16, ///< format used for weights for 2D convolution diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index b2ce2930fd2..9e47e5b03e8 
100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -445,6 +445,7 @@ static cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_groupe case dnnl::memory::format_tag::aBCde4b8c8b2c: return cldnn::format::g_os_is_yx_osa4_isa8_osv8_isv2; case dnnl::memory::format_tag::aBCde8b2c: return cldnn::format::g_os_is_yx_osv8_isv2; case dnnl::memory::format_tag::aBCde8b4c: return cldnn::format::g_os_is_yx_osv8_isv4; + case dnnl::memory::format_tag::aBcde8b: return cldnn::format::g_os_iyx_osv8; case dnnl::memory::format_tag::aBCd2b8c16b4c: return cldnn::format::g_os_is_yx_osa2_isa8_osv16_isv4; case dnnl::memory::format_tag::aBCd2b8c16b2c: return cldnn::format::g_os_is_yx_osa2_isa8_osv16_isv2; case dnnl::memory::format_tag::aBCdef16c16b: return cldnn::format::g_os_is_zyx_isv16_osv16; diff --git a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp index 7fb3b1b6947..8721234761a 100644 --- a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp @@ -421,6 +421,8 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { return kernel_selector::weights_layout::goizyx; case format::giozyx: return kernel_selector::weights_layout::giozyx; + case format::g_os_iyx_osv8: + return kernel_selector::weights_layout::g_os_iyx_osv8; case format::g_os_iyx_osv16: return kernel_selector::weights_layout::g_os_iyx_osv16; case format::g_os_iyx_osv32: @@ -691,6 +693,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { return cldnn::format::goiyx; case kernel_selector::weights_layout::goizyx: return cldnn::format::goizyx; + case kernel_selector::weights_layout::g_os_iyx_osv8: + return cldnn::format::g_os_iyx_osv8; case kernel_selector::weights_layout::g_os_iyx_osv16: return cldnn::format::g_os_iyx_osv16; case 
kernel_selector::weights_layout::g_os_iyx_osv32: diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl index e8c42ffae5f..e97309fe5a5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl @@ -123,6 +123,8 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x return GET_FILTER_GOIZYX(INPUT0, g, o, i, z, y, x); #elif defined INPUT0_LAYOUT_GIOZYX return GET_FILTER_GIOZYX(INPUT0, g, o, i, z, y, x); +#elif defined INPUT0_LAYOUT_G_OS_IYX_OSV8 + return GET_FILTER_G_OS_IYX_OSV16(INPUT0, g, o, i, y, x, 8); #elif defined INPUT0_LAYOUT_G_OS_IYX_OSV16 return GET_FILTER_G_OS_IYX_OSV16(INPUT0, g, o, i, y, x, 16); #elif defined INPUT0_LAYOUT_G_OS_IYX_OSV32 @@ -371,6 +373,8 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint return GET_FILTER_OS_ZY_IS_X_OSV8_ISV4_INDEX(OUTPUT, o, i, z, y, x); #elif defined OUTPUT_LAYOUT_GOIZYX || defined OUTPUT_LAYOUT_GIOZYX return GET_FILTER_INDEX_5D(OUTPUT, g, o, i, z, y, x); +#elif defined OUTPUT_LAYOUT_G_OS_IYX_OSV8 + return GET_FILTER_G_OS_IYX_OSV16(OUTPUT, g, o, i, y, x, 8); #elif defined OUTPUT_LAYOUT_G_OS_IYX_OSV16 return GET_FILTER_G_OS_IYX_OSV16(OUTPUT, g, o, i, y, x, 16); #elif defined OUTPUT_LAYOUT_G_OS_IYX_OSV32 diff --git a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp index 306782d4318..cf5c35cec3c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp @@ -649,7 +649,7 @@ class WeightTensorJitConstant : public TensorBaseTJitConstantcalcFunction = FuncBody(layout_name); @@ -881,6 +881,8 @@ JitDefinitions WeightTensorJitConstant::GetDefinitions() const { index_func_val = called_func_name + "(" + _name + ", g, o, i, 0, y, x)"; else if 
(layout == WeightsLayout::g_os_is_yx_isv16_osv16) index_func_val = called_func_name + "(" + _name + ", g, o, i, 0, y, x, 16)"; + else if (layout == WeightsLayout::g_os_iyx_osv8) + index_func_val = called_func_name + "(" + _name + ", g, o, i, y, x, 8)"; else if (layout == WeightsLayout::g_os_iyx_osv16) index_func_val = called_func_name + "(" + _name + ", g, o, i, y, x, 16)"; else if (layout == WeightsLayout::g_is_os_yx_isv16_osv16) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp index 34b8c560ccb..183e97c68fd 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.cpp @@ -379,6 +379,7 @@ std::string toString(WeightsLayout layout) { case WeightsLayout::gyxio: return "GYXIO"; case WeightsLayout::goizyx: return "GOIZYX"; case WeightsLayout::giozyx: return "GIOZYX"; + case WeightsLayout::g_os_iyx_osv8: return "G_OS_IYX_OSV8"; case WeightsLayout::g_os_iyx_osv16: return "G_OS_IYX_OSV16"; case WeightsLayout::g_os_iyx_osv32: return "G_OS_IYX_OSV32"; case WeightsLayout::gs_oiyx_gsv16: return "GS_OIYX_GSV16"; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel_base.cpp index 8ffcb32d7f1..1a2ca80a035 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel_base.cpp @@ -48,6 +48,7 @@ inline uint32_t SubGroupSize(WeightsLayout l) { case WeightsLayout::os_i_osv8__ai8: case WeightsLayout::iy_xs_os_xsv2_osv8__ao32: case WeightsLayout::giy_xs_os_xsv2_osv8__ao32: + case WeightsLayout::g_os_iyx_osv8: return 8; default: return 1; diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp 
b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp index e2dcdecd7ed..3aa287b7c5e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.cpp @@ -166,6 +166,7 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{ { WeightsLayout::gioyx, { 0, 1, -1, 3, 2, 4 } }, { WeightsLayout::goizyx, { 0, 1, 2, 3, 4, 5 } }, { WeightsLayout::giozyx, { 0, 1, 2, 4, 3, 5 } }, + { WeightsLayout::g_os_iyx_osv8, { 0, 1, -1, 2, 3, 4 } }, { WeightsLayout::g_os_iyx_osv16, { 0, 1, -1, 2, 3, 4 } }, { WeightsLayout::g_os_iyx_osv32, { 0, 1, -1, 2, 3, 4 } }, { WeightsLayout::gs_oiyx_gsv16, { 0, 1, -1, 2, 3, 4 } }, @@ -766,6 +767,10 @@ NDims WeightsTensor::GetSimpleDims(const std::vector& d, WeightsLayout l case os_i_yxs_osv4_yxsv4: newDims[3] = RoundUp(newDims[3], 4); break; + case g_os_iyx_osv8: + assert(newDims.size() == 5); + newDims[3] = RoundUp(newDims[3], 8); + break; case g_os_iyx_osv16: case g_os_iyx_osv16_rotate_180: assert(newDims.size() == 5); diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h index a38cfa50434..2a214e7f02c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h +++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h @@ -184,6 +184,7 @@ enum WeightsLayout { goizyx, giozyx, gyxio, + g_os_iyx_osv8, g_os_iyx_osv16, g_os_iyx_osv32, gs_oiyx_gsv16, diff --git a/src/plugins/intel_gpu/src/runtime/format.cpp b/src/plugins/intel_gpu/src/runtime/format.cpp index fad495981ec..cb156477b96 100644 --- a/src/plugins/intel_gpu/src/runtime/format.cpp +++ b/src/plugins/intel_gpu/src/runtime/format.cpp @@ -87,7 +87,6 @@ static const std::map format_traits_map { FMT_TRAITS(image_2d_weights_c4_fyx_b, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {}), FMT_TRAITS(image_2d_weights_c1_b_fyx, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {}), FMT_TRAITS(lstm_weights_dio, 1, 1, 2, 0, {0, 1, 3, 2}, 
"oixy", "oixy?", {}), - FMT_TRAITS(os_is_yx_isa8_osv8_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {}), FMT_TRAITS(os_is_yx_isa8_osv16_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {}), FMT_TRAITS(os_is_yx_isa8_osv8_isv4_swizzled_by_4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {}), FMT_TRAITS(os_is_yx_osa4_isa8_osv8_isv2, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {{0, 32}, {1, 16}}), @@ -98,8 +97,6 @@ static const std::map format_traits_map { FMT_TRAITS(os_is_yx_osa2_isa8_osv16_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 32}, {1, 32}}), FMT_TRAITS(os_is_yx_osa2_isa8_osv8_isv2, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 16}, {1, 16}}), FMT_TRAITS(os_is_zyx_osa2_isa8_osv8_isv2, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{0, 16}, {1, 16}}), - FMT_TRAITS(os_is_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}), - FMT_TRAITS(os_is_zyx_isa8_osv16_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}), FMT_TRAITS(os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {{0, 32}, {1, 32}}), FMT_TRAITS(os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{0, 32}, {1, 32}}), FMT_TRAITS(is_os_yx_osa4_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{0, 32}, {1, 32}}), @@ -125,9 +122,12 @@ static const std::map format_traits_map { FMT_TRAITS(is_os_yx_isv16_osv8, 1, 1, 2, 0, {1, 0, 2, 3, 4}, "ioyx", "oixy", {{1, 16}, {0, 8}}), FMT_TRAITS(is_os_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "ioxyz", {{1, 8}, {0, 8}, {1, 2}}), FMT_TRAITS(is_os_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "ioxyz", {{1, 8}, {0, 8}, {1, 4}}), + FMT_TRAITS(os_is_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}), FMT_TRAITS(os_is_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 2}}), + FMT_TRAITS(os_is_zyx_isa8_osv16_isv4, 1, 
1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}), FMT_TRAITS(is_os_yx_isa8_osv8_isv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy?", {{1, 8}, {0, 8}, {1, 2}}), FMT_TRAITS(is_os_yx_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy?", {{1, 8}, {0, 8}, {1, 4}}), + FMT_TRAITS(os_is_yx_isa8_osv8_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {{1, 8}, {0, 8}, {1, 4}}), FMT_TRAITS(os_is_yx_isa8_osv8_isv2, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy?", {{1, 8}, {0, 8}, {1, 2}}), FMT_TRAITS(os_is_osv32_isv32_swizzled_by_4, 1, 1, 0, 0, {0, 1, 2, 3}, "oixy", "oixy?", {{0, 32}, {1, 32}}), FMT_TRAITS(os_is_zyx_isv8_osv16_isv2, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 2}}), @@ -156,6 +156,7 @@ static const std::map format_traits_map { FMT_TRAITS(gioyx, 1, 1, 2, 1, {0, 2, 1, 3, 4}, "gioyx", "oixy??g", {}), FMT_TRAITS(goizyx, 1, 1, 3, 1, {0, 1, 2, 3, 4, 5}, "goizyx", "oixyz?g", {}), FMT_TRAITS(giozyx, 1, 1, 3, 1, {0, 2, 1, 3, 4, 5}, "giozyx", "oixyz?g", {}), + FMT_TRAITS(g_os_iyx_osv8, 1, 1, 2, 1, {0, 1, 2, 3, 4}, "goiyx", "oixy??g", {{0, 8}}), FMT_TRAITS(g_os_iyx_osv16, 1, 1, 2, 1, {0, 1, 2, 3, 4}, "goiyx", "oixy??g", {{0, 16}}), FMT_TRAITS(g_os_iyx_osv32, 1, 1, 2, 1, {0, 1, 2, 3, 4}, "goiyx", "oixy??g", {{0, 32}}), FMT_TRAITS(gs_oiyx_gsv16, 1, 1, 2, 1, {0, 1, 2, 3, 4}, "goiyx", "oixy??g", {{6, 16}}), diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index e6a06941b75..bf3ea878018 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -32,6 +32,7 @@ cl::PFN_clCreateFromD3D11Buffer cl::BufferDX::pfn_clCreateFromD3D11Buffer = NULL #ifdef ENABLE_ONEDNN_FOR_GPU #include +#include "openvino/util/file_util.hpp" #endif namespace cldnn { @@ -64,7 +65,38 @@ dnnl::engine& ocl_engine::get_onednn_engine() const { if (!casted) throw ov::Exception("[GPU] Invalid device type stored in ocl_engine"); - 
_onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get())); + auto config = this->configuration(); + if (config.kernels_cache_path.empty()) { + _onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get())); + } else { + // Use cached blob + auto path = config.kernels_cache_path; + if (path.back() != '/' && path.back() != '\\') { + path += "/"; + } + + auto blob_id = dnnl::ocl_interop::get_engine_cache_blob_id(casted->get_device().get()); + if (blob_id.empty()) { + // Create engine without cache_blob + _onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get())); + return *_onednn_engine; + } + + std::string id_str(blob_id.begin(), blob_id.end()); + size_t hash = std::hash<std::string>()(id_str); + path = path + std::to_string(hash) + ".onednn.cl_cache"; + + auto onednn_cache_blob = ov::util::load_binary(path); + if (onednn_cache_blob.empty()) { + _onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get())); + + onednn_cache_blob = dnnl::ocl_interop::get_engine_cache_blob(*_onednn_engine); + ov::util::save_binary(path, onednn_cache_blob); + } else { + _onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get(), + onednn_cache_blob)); + } + } } return *_onednn_engine; diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index b5faa77a4a6..e5a70f43639 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit b5faa77a4a651f1e44fa77348eded54ea3ec3eef +Subproject commit e5a70f43639ba968869a99931d77116791ace355