[GPU] Baseline for enabling GPUs from other vendors (#12577)

2022-11-01 18:02:29 +04:00
parent 7595fd4c4e
commit af9724e8da
13 changed files with 281 additions and 153 deletions
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device.hpp
@@ -6,11 +6,14 @@

 #include "device_info.hpp"
 #include "memory_caps.hpp"
+#include "layout.hpp"

 #include <memory>

 namespace cldnn {

+const uint32_t INTEL_VENDOR_ID = 0x8086;
+
 /// @brief Represents detected GPU device object. Use device_query to get list of available objects.
 struct device {
 public:
@@ -20,6 +23,8 @@ public:

    virtual bool is_same(const device::ptr other) = 0;

+    float get_gops(cldnn::data_types dt) const;
+
    virtual ~device() = default;
 };

--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -1548,7 +1548,9 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {

 #ifdef ENABLE_ONEDNN_FOR_GPU
    auto& engine = get_engine();
-    if (engine.get_device_info().supports_immad && engine.configuration().queue_type == queue_types::in_order)
+    if (engine.get_device_info().supports_immad &&
+        engine.get_device_info().vendor_id == INTEL_VENDOR_ID &&
+        engine.configuration().queue_type == queue_types::in_order)
        lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 1);
 #endif
 }
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_biplanar_nv12.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_biplanar_nv12.cl
@@ -85,25 +85,25 @@ KERNEL(reorder_biplanar_nv12)(
    B -= VALUE_TO_SUBTRACT[2];
 #elif defined MEAN_SUBTRACT_IN_BUFFER
    uint8 msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, 0, w, z, y, x);
-    R -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
+    R -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];

    msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, 1, w, z, y, x);
-    G -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
+    G -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];

    msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, 2, w, z, y, x);
-    B -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
+    B -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];
 #endif

    uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
-    uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    uint output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(R), NL_M, NL_N);

    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
-    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(G), NL_M, NL_N);

    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
-    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(B), NL_M, NL_N);


--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data.cl
@@ -117,7 +117,7 @@ KERNEL (reorder_data)(
 #else
    uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, f, w, z, y, x);
    const uint input_idx  = FUNC_CALL(get_input_index)(b, f, w, z, y, x);
-    const uint output_idx = FUNC_CALL(get_output_index)(ov[1],ov[2],ov[3],ov[4], ov[5], ov[6]);
+    const uint output_idx = FUNC_CALL(get_output_index)(ov.s1,ov.s2,ov.s3,ov.s4,ov.s5,ov.s6);

 #if defined MEAN_SUBTRACT_INSIDE_PARAMS
    float res = TO_MEAN_TYPE(input[input_idx]);
@@ -130,7 +130,7 @@ KERNEL (reorder_data)(
    // TODO Add support for 6D mean
    MEAN_SUBTRACT_TYPE res = TO_MEAN_TYPE(input[input_idx]);
    uint8 msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, f, w, z, y, x);
-    res = MEAN_OP(res, mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], /*msv[3], msv[4],*/ msv[5], msv[6])]);
+    res = MEAN_OP(res, mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, /*msv.s3, msv.s4,*/ msv.s5, msv.s6)]);
 #endif
 #else
    CALC_TYPE res = TO_CALC_TYPE(input[input_idx]);
@@ -139,27 +139,27 @@ KERNEL (reorder_data)(

 #if defined INPUT0_LAYOUT_NV12
    uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
-    uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    uint output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(R), NL_M, NL_N);
    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
-    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(G), NL_M, NL_N);
    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
-    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(B), NL_M, NL_N);
 #elif INPUT0_LAYOUT_IMAGE_2D_RGBA
    uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
-    uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    uint output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s0), NL_M, NL_N);
    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
-    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s1), NL_M, NL_N);
    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
-    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s2), NL_M, NL_N);
 #if INPUT0_FEATURE_NUM == 4
    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 3, w, z, y, x);
-    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s3), NL_M, NL_N);
 #endif
 #elif OUTPUT_LAYOUT_IMAGE_2D_RGBA
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_fast_b1.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_data_fast_b1.cl
@@ -229,7 +229,7 @@ KERNEL (reorder_data_fast_b1)(
    const uint output_idx = data_idx;
 #else
    uint8 ov = RESHAPE_DIMS(OUTPUT, INPUT0, b, f, w, z, y, x);
-    const uint input_idx = FUNC_CALL(get_input_index)(ov[1], ov[2], ov[3], ov[4], ov[5],ov[6]);
+    const uint input_idx = FUNC_CALL(get_input_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
    const uint output_idx  = FUNC_CALL(get_output_index)(b, f, w, z, y, x);
 #endif

@@ -239,7 +239,7 @@ KERNEL (reorder_data_fast_b1)(
 #elif defined MEAN_SUBTRACT_IN_BUFFER
    MEAN_SUBTRACT_TYPE res = TO_MEAN_TYPE(input[input_idx]);
    uint8 msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, f, w, z, y, x);
-    res -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
+    res -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];
 #else
    CALC_TYPE res = TO_CALC_TYPE(input[input_idx]);
 #endif
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/reorder_weights.cl
@@ -452,7 +452,7 @@ KERNEL (reorder_weights)(const __global INPUT0_TYPE* input, write_only image2d_t
    MAKE_VECTOR_TYPE(UNIT_TYPE, 4) input_val = (MAKE_VECTOR_TYPE(UNIT_TYPE, 4))(UNIT_VAL_ZERO, UNIT_VAL_ZERO, UNIT_VAL_ZERO, UNIT_VAL_ZERO);
    const int2 coord = (int2)(o, iyx);
    uint8 ir = RESHAPE_WEIGHT_DIMS(OUTPUT, INPUT0, o, i, 0, 0, y, x);
-    input_val.s0 = TO_OUTPUT_TYPE(input[FUNC_CALL(get_input_index)(ir[0],ir[1],ir[2],ir[4],ir[5],ir[6])]);
+    input_val.s0 = TO_OUTPUT_TYPE(input[FUNC_CALL(get_input_index)(ir.s0,ir.s1,ir.s2,ir.s4,ir.s5,ir.s6)]);
    IMAGE_WRITE(output, coord, input_val);
 }
 #else
@@ -489,7 +489,7 @@ KERNEL (reorder_weights)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE
    uint8 ir = RESHAPE_WEIGHT_DIMS(OUTPUT, INPUT0, o, i, 0, z, y, x);
 #endif

-    uint input_idx = FUNC_CALL(get_input_index)(ir[0],ir[1],ir[2],ir[4],ir[5],ir[6]);
+    uint input_idx = FUNC_CALL(get_input_index)(ir.s0,ir.s1,ir.s2,ir.s4,ir.s5,ir.s6);
 #if !REORDER_ROTATE
    uint output_idx = FUNC_CALL(get_output_index)(g, o, i, z, y, x);
 #else
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -512,58 +512,6 @@ auto StringRightTrim = [](std::string string, std::string substring, bool case_s
    return ret_str;
 };

-static float GetGOPS(cldnn::device_info info, cldnn::data_types dt) {
-    auto freqGHz = info.gpu_frequency / 1000.f;
-    auto numEUs = info.execution_units_count;
-    auto opsPerComputeBlock = 0;
-    auto computeBlockIPC = 1.0f;
-    switch (dt) {
-    case cldnn::data_types::u8:
-    case cldnn::data_types::i8: {
-        if (info.supports_immad) {
-            if (info.gfx_ver.major == 12) {
-                if (info.gfx_ver.minor == 5)
-                    opsPerComputeBlock = 512;
-                else if (info.gfx_ver.minor == 7)
-                    opsPerComputeBlock = 256;
-            }
-        } else if (info.supports_imad) {
-            // fma * simd size
-            opsPerComputeBlock = 2 * 32;
-        } else {
-            // separate mul + add instructions for int8 data type
-            opsPerComputeBlock = 2 * 16;
-            // mul/add instructions can't be executed in parallel, so we need 2 clocks to execute compute block
-            computeBlockIPC = 0.5f;
-        }
-        break;
-    }
-    case cldnn::data_types::f16: {
-        if (info.supports_immad) {
-            if (info.gfx_ver.major == 12) {
-                if (info.gfx_ver.minor == 5)
-                    opsPerComputeBlock = 256;
-                else if (info.gfx_ver.minor == 7)
-                    opsPerComputeBlock = 128;
-            }
-        } else {
-            // fma * simd size
-            opsPerComputeBlock = 2 * 16;
-        }
-        break;
-    }
-    case cldnn::data_types::f32: {
-        // fma * simd size
-        opsPerComputeBlock = 2 * 8;
-        break;
-    }
-
-    default: throw std::runtime_error("GetGOPS: Unsupported precision");
-    }
-
-    return freqGHz * opsPerComputeBlock * computeBlockIPC * numEUs;
-}
-
 Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string, Parameter>& options) const {
    OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::GetMetric");
    GPU_DEBUG_GET_INSTANCE(debug_config);
@@ -648,17 +596,17 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
    } else if (name == ov::device::gops) {
        if (is_new_api) {
            std::map<element::Type, float> gops;
-            gops[element::i8] = GetGOPS(device_info, cldnn::data_types::i8);
-            gops[element::u8] = GetGOPS(device_info, cldnn::data_types::u8);
-            gops[element::f16] = GetGOPS(device_info, cldnn::data_types::f16);
-            gops[element::f32] = GetGOPS(device_info, cldnn::data_types::f32);
+            gops[element::i8] = device->get_gops(cldnn::data_types::i8);
+            gops[element::u8] = device->get_gops(cldnn::data_types::u8);
+            gops[element::f16] = device->get_gops(cldnn::data_types::f16);
+            gops[element::f32] = device->get_gops(cldnn::data_types::f32);
            return decltype(ov::device::gops)::value_type {gops};
        } else {
            std::map<InferenceEngine::Precision, float> gops;
-            gops[InferenceEngine::Precision::I8] = GetGOPS(device_info, cldnn::data_types::i8);
-            gops[InferenceEngine::Precision::U8] = GetGOPS(device_info, cldnn::data_types::u8);
-            gops[InferenceEngine::Precision::FP16] = GetGOPS(device_info, cldnn::data_types::f16);
-            gops[InferenceEngine::Precision::FP32] = GetGOPS(device_info, cldnn::data_types::f32);
+            gops[InferenceEngine::Precision::I8] = device->get_gops(cldnn::data_types::i8);
+            gops[InferenceEngine::Precision::U8] = device->get_gops(cldnn::data_types::u8);
+            gops[InferenceEngine::Precision::FP16] = device->get_gops(cldnn::data_types::f16);
+            gops[InferenceEngine::Precision::FP32] = device->get_gops(cldnn::data_types::f32);
            IE_SET_METRIC_RETURN(DEVICE_GOPS, gops);
        }
    } else if (name == ov::intel_gpu::execution_units_count) {
--- a/src/plugins/intel_gpu/src/runtime/device.cpp
+++ b/src/plugins/intel_gpu/src/runtime/device.cpp
@@ -0,0 +1,66 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "intel_gpu/runtime/device.hpp"
+
+namespace cldnn {
+// Estimates peak GOPS for data type dt as freqGHz * opsPerComputeBlock * computeBlockIPC * numEUs; yields 0 for non-Intel devices.
+float device::get_gops(cldnn::data_types dt) const {
+    auto info = get_info();
+    if (info.vendor_id != INTEL_VENDOR_ID) {
+        // GOPS calculation is not supported for non-Intel GPUs
+        return 0.0f;
+    }
+    auto freqGHz = info.gpu_frequency / 1000.f;  // presumably gpu_frequency is reported in MHz - TODO confirm
+    auto numEUs = info.execution_units_count;
+    auto opsPerComputeBlock = 0;  // stays 0, making the result 0, if no branch below assigns it
+    auto computeBlockIPC = 1.0f;
+    switch (dt) {
+    case cldnn::data_types::u8:
+    case cldnn::data_types::i8: {
+        if (info.supports_immad) {
+            if (info.gfx_ver.major == 12) {  // only 12.5 and 12.7 are assigned a value; other versions keep 0
+                if (info.gfx_ver.minor == 5)
+                    opsPerComputeBlock = 512;
+                else if (info.gfx_ver.minor == 7)
+                    opsPerComputeBlock = 256;
+            }
+        } else if (info.supports_imad) {
+            // fma * simd size
+            opsPerComputeBlock = 2 * 32;
+        } else {
+            // separate mul + add instructions for int8 data type
+            opsPerComputeBlock = 2 * 16;
+            // mul/add instructions can't be executed in parallel, so we need 2 clocks to execute compute block
+            computeBlockIPC = 0.5f;
+        }
+        break;
+    }
+    case cldnn::data_types::f16: {
+        if (info.supports_immad) {
+            if (info.gfx_ver.major == 12) {  // same version gating as the int8 case above
+                if (info.gfx_ver.minor == 5)
+                    opsPerComputeBlock = 256;
+                else if (info.gfx_ver.minor == 7)
+                    opsPerComputeBlock = 128;
+            }
+        } else {
+            // fma * simd size
+            opsPerComputeBlock = 2 * 16;
+        }
+        break;
+    }
+    case cldnn::data_types::f32: {
+        // fma * simd size
+        opsPerComputeBlock = 2 * 8;
+        break;
+    }
+    // Every other data type fails the assertion below
+    default: OPENVINO_ASSERT(false, "[GPU] get_gops: unsupported precision: ", dt);
+    }
+
+    return freqGHz * opsPerComputeBlock * computeBlockIPC * numEUs;
+}
+
+}  // namespace cldnn
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.cpp
@@ -22,25 +22,43 @@

 namespace {
 bool does_device_match_config(bool out_of_order, const cl::Device& device) {
-// Is it intel gpu
-if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU ||
-    device.getInfo<CL_DEVICE_VENDOR_ID>() != 0x8086) {
-    return false;
-}
-
-// Does device support OOOQ?
-if (out_of_order) {
-    auto queue_properties = device.getInfo<CL_DEVICE_QUEUE_PROPERTIES>();
-    using cmp_t = std::common_type<decltype(queue_properties),
-        typename std::underlying_type<cl::QueueProperties>::type>::type;
-    if (!(static_cast<cmp_t>(queue_properties) & static_cast<cmp_t>(cl::QueueProperties::OutOfOrder))) {
+    if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
        return false;
    }
+
+    // TODO: Remove the check below once kernels are fixed
+    if (device.getInfo<CL_DEVICE_VENDOR_ID>() != cldnn::INTEL_VENDOR_ID)
+        return false;
+
+    // Does device support OOOQ?
+    if (out_of_order) {
+        auto queue_properties = device.getInfo<CL_DEVICE_QUEUE_PROPERTIES>();
+        using cmp_t = std::common_type<decltype(queue_properties),
+            typename std::underlying_type<cl::QueueProperties>::type>::type;
+        if (!(static_cast<cmp_t>(queue_properties) & static_cast<cmp_t>(cl::QueueProperties::OutOfOrder))) {
+            return false;
+        }
+    }
+
+    return true;
 }

-return true;
+// The priority returned by this function determines the order in which the GPU plugin enumerates and reports devices
+// A lower priority value means a lower device ID
+// Current behavior is: Intel iGPU < Intel dGPU < any other GPU
+// The relative order of Intel dGPUs is not defined here and depends on the OCL impl
+// The relative order of other vendors' GPUs is not defined here and depends on the OCL impl
+size_t get_device_priority(const cldnn::device_info& info) {
+    if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) {
+        return 0;
+    } else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) {
+        return 1;
+    } else {
+        return std::numeric_limits<size_t>::max();
+    }
+}
 }  // namespace
+
 namespace cldnn {
 namespace ocl {
 static constexpr auto INTEL_PLATFORM_VENDOR = "Intel(R) Corporation";
@@ -56,10 +74,8 @@ static std::vector<cl::Device> getSubDevices(cl::Device& rootDevice) {
                                     sizeof(maxSubDevices),
                                     &maxSubDevices, &maxSubDevicesSize);

-    if (err != CL_SUCCESS || maxSubDevicesSize != sizeof(maxSubDevices)) {
-        throw cl::Error(err, "clGetDeviceInfo(..., CL_DEVICE_PARTITION_MAX_SUB_DEVICES,...)");
-    }
-
+    OPENVINO_ASSERT(err == CL_SUCCESS && maxSubDevicesSize == sizeof(maxSubDevices),
+                    "[GPU] clGetDeviceInfo(..., CL_DEVICE_PARTITION_MAX_SUB_DEVICES,...)");
    if (maxSubDevices == 0) {
        return {};
    }
@@ -91,47 +107,50 @@ static std::vector<cl::Device> getSubDevices(cl::Device& rootDevice) {
    return subDevices;
 }

+// Returns a copy of devices_list stably sorted by get_device_priority() (lowest first); ties keep their input order.
+std::vector<device::ptr> ocl_device_detector::sort_devices(const std::vector<device::ptr>& devices_list) {
+    std::vector<device::ptr> sorted_list = devices_list;
+    std::stable_sort(sorted_list.begin(), sorted_list.end(), [](const device::ptr& d1, const device::ptr& d2) {
+        return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info());
+    });
+    return sorted_list;
+}
+
 std::map<std::string, device::ptr> ocl_device_detector::get_available_devices(void* user_context,
                                                                              void* user_device,
                                                                              int ctx_device_id,
                                                                              int target_tile_id) const {
    bool host_out_of_order = true;  // Change to false, if debug requires in-order queue.
-    std::vector<device::ptr> dev_orig, dev_sorted;
+    std::vector<device::ptr> devices_list;
    if (user_context != nullptr) {
-        dev_orig = create_device_list_from_user_context(host_out_of_order, user_context, ctx_device_id);
+        devices_list = create_device_list_from_user_context(host_out_of_order, user_context, ctx_device_id);
    } else if (user_device != nullptr) {
-        dev_orig = create_device_list_from_user_device(host_out_of_order, user_device);
+        devices_list = create_device_list_from_user_device(host_out_of_order, user_device);
    } else {
-        dev_orig = create_device_list(host_out_of_order);
+        devices_list = create_device_list(host_out_of_order);
    }

+    devices_list = sort_devices(devices_list);
+
    std::map<std::string, device::ptr> ret;
-    for (auto& dptr : dev_orig) {
-        if (dptr->get_info().dev_type == cldnn::device_type::integrated_gpu)
-            dev_sorted.insert(dev_sorted.begin(), dptr);
-        else
-            dev_sorted.push_back(dptr);
-    }
    uint32_t idx = 0;
-    for (auto& dptr : dev_sorted) {
+    for (auto& dptr : devices_list) {
        auto map_id = std::to_string(idx++);
        ret[map_id] = dptr;

-        auto rootDevice = std::dynamic_pointer_cast<ocl_device>(dptr);
-        if (!rootDevice) {
-            throw std::runtime_error("Invalid device type created in ocl_device_detector");
-        }
+        auto root_device = std::dynamic_pointer_cast<ocl_device>(dptr);
+        OPENVINO_ASSERT(root_device != nullptr, "[GPU] Invalid device type created in ocl_device_detector");

-        auto subDevices = getSubDevices(rootDevice->get_device());
-        if (!subDevices.empty()) {
+        auto sub_devices = getSubDevices(root_device->get_device());
+        if (!sub_devices.empty()) {
            uint32_t sub_idx = 0;
-            for (auto& subdevice : subDevices) {
+            for (auto& sub_device : sub_devices) {
                if (target_tile_id != -1 && static_cast<int>(sub_idx) != target_tile_id) {
                    sub_idx++;
                    continue;
                }
-                auto subdPtr = std::make_shared<ocl_device>(subdevice, cl::Context(subdevice), rootDevice->get_platform());
-                ret[map_id+"."+std::to_string(sub_idx++)] = subdPtr;
+                auto sub_device_ptr = std::make_shared<ocl_device>(sub_device, cl::Context(sub_device), root_device->get_platform());
+                ret[map_id + "." + std::to_string(sub_idx++)] = sub_device_ptr;
            }
        }
    }
@@ -142,72 +161,56 @@ std::vector<device::ptr> ocl_device_detector::create_device_list(bool out_out_or
    cl_uint n = 0;
    // Get number of platforms availible
    cl_int err = clGetPlatformIDs(0, NULL, &n);
-    if (err != CL_SUCCESS) {
-        throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
-    }
-
+    OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ",  err);
    // Get platform list
    std::vector<cl_platform_id> platform_ids(n);
    err = clGetPlatformIDs(n, platform_ids.data(), NULL);
-    if (err != CL_SUCCESS) {
-        throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
-    }
+    OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ",  err);

-    std::vector<device::ptr> ret;
+    std::vector<device::ptr> supported_devices;
    for (auto& id : platform_ids) {
        cl::Platform platform = cl::Platform(id);

-        if (platform.getInfo<CL_PLATFORM_VENDOR>() != INTEL_PLATFORM_VENDOR)
-            continue;
-
        std::vector<cl::Device> devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        for (auto& device : devices) {
            if (!does_device_match_config(out_out_order, device))
                continue;
-            ret.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device), id));
+            supported_devices.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device), id));
        }
    }
-    if (ret.empty()) {
-        throw std::runtime_error("[CLDNN ERROR]. No GPU device was found.");
-    }
-    return ret;
+    OPENVINO_ASSERT(!supported_devices.empty(), "[GPU] No GPU device was found.");
+    return supported_devices;
 }

 std::vector<device::ptr>  ocl_device_detector::create_device_list_from_user_context(bool out_out_order, void* user_context, int ctx_device_id) const {
    cl::Context ctx = cl::Context(static_cast<cl_context>(user_context), true);
    auto all_devices = ctx.getInfo<CL_CONTEXT_DEVICES>();

-    std::vector<device::ptr> ret;
+    std::vector<device::ptr> supported_devices;
    for (size_t i = 0; i < all_devices.size(); i++) {
        auto& device = all_devices[i];
        if (!does_device_match_config(out_out_order, device) || static_cast<int>(i) != ctx_device_id)
            continue;
-        ret.emplace_back(std::make_shared<ocl_device>(device, ctx, device.getInfo<CL_DEVICE_PLATFORM>()));
+        supported_devices.emplace_back(std::make_shared<ocl_device>(device, ctx, device.getInfo<CL_DEVICE_PLATFORM>()));
    }

-    if (ret.empty()) {
-        throw std::runtime_error("[CLDNN ERROR]. User defined context does not have GPU device included!");
-    }
-    return ret;
+    OPENVINO_ASSERT(!supported_devices.empty(), "[GPU] User defined context does not have GPU device included.");
+    return supported_devices;
 }

-std::vector<device::ptr>  ocl_device_detector::create_device_list_from_user_device(bool out_out_order, void* user_device) const {
+std::vector<device::ptr> ocl_device_detector::create_device_list_from_user_device(bool out_out_order, void* user_device) const {
    cl_uint n = 0;
    // Get number of platforms availible
    cl_int err = clGetPlatformIDs(0, NULL, &n);
-    if (err != CL_SUCCESS) {
-        throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
-    }
+    OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ",  err);

    // Get platform list
    std::vector<cl_platform_id> platform_ids(n);
    err = clGetPlatformIDs(n, platform_ids.data(), NULL);
-    if (err != CL_SUCCESS) {
-        throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
-    }
+    OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ",  err);

-    std::vector<device::ptr> ret;
+    std::vector<device::ptr> supported_devices;
    for (auto& id : platform_ids) {
        cl::PlatformVA platform = cl::PlatformVA(id);

@@ -250,13 +253,11 @@ std::vector<device::ptr>  ocl_device_detector::create_device_list_from_user_devi
                CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
                CL_CONTEXT_PLATFORM, (cl_context_properties)id,
                0 };
-            ret.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device, props), id));
+            supported_devices.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device, props), id));
        }
    }
-    if (ret.empty()) {
-        throw std::runtime_error("[CLDNN ERROR]. No corresponding GPU device was found.");
-    }
-    return ret;
+    OPENVINO_ASSERT(!supported_devices.empty(), "[GPU] User specified device is not supported.");
+    return supported_devices;
 }

 }  // namespace ocl
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device_detector.hpp
@@ -21,6 +21,8 @@ public:

    std::map<std::string, device::ptr> get_available_devices(void *user_context, void *user_device, int ctx_device_id = 0, int target_tile_id = -1) const;

+    static std::vector<device::ptr> sort_devices(const std::vector<device::ptr>& devices_list);
+
 private:
    std::vector<device::ptr> create_device_list(bool out_out_order) const;
    std::vector<device::ptr> create_device_list_from_user_context(bool out_out_order, void* user_context, int ctx_device_id = 0) const;
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp
@@ -57,6 +57,7 @@ ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type,
 #ifdef ENABLE_ONEDNN_FOR_GPU
 dnnl::engine& ocl_engine::get_onednn_engine() const {
    const std::lock_guard<std::mutex> lock(onednn_mutex);
+    OPENVINO_ASSERT(_device->get_info().vendor_id == INTEL_VENDOR_ID, "[GPU] OneDNN engine can be used for Intel GPUs only");
    if (!_onednn_engine) {
        auto casted = std::dynamic_pointer_cast<ocl_device>(_device);
        if (!casted)
--- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
+++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
@@ -291,7 +291,7 @@ ocl_stream::ocl_stream(const ocl_engine &engine)
    _command_queue = queue_builder.build(context, device);

 #ifdef ENABLE_ONEDNN_FOR_GPU
-    if (config.queue_type == queue_types::in_order) {
+    if (config.queue_type == queue_types::in_order && engine.get_device_info().vendor_id == INTEL_VENDOR_ID) {
        auto onednn_engine = engine.get_onednn_engine();
        _onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(engine.get_onednn_engine(), _command_queue.get()));
    }
--- a/src/plugins/intel_gpu/tests/module_tests/device_test.cpp
+++ b/src/plugins/intel_gpu/tests/module_tests/device_test.cpp
@@ -0,0 +1,103 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+#include "intel_gpu/runtime/device.hpp"
+#include "runtime/ocl/ocl_device_detector.hpp"
+#include <memory>
+
+using namespace cldnn;
+using namespace ::tests;
+
+namespace {
+// Minimal device stub for the sorting tests: its device_info carries only a vendor id, a device type and a device id.
+struct dummy_device : public device {
+public:
+    dummy_device(uint32_t vendor_id, device_type type, size_t device_id) : _mem_caps({}) {
+        _info = device_info{};  // start from a value-initialized device_info, then set only the fields below
+        _info.vendor_id = vendor_id;
+        _info.dev_type = type;
+        _info.device_id = device_id;
+    }
+
+    device_info get_info() const override { return _info; }
+    memory_capabilities get_mem_caps() const override { return _mem_caps; }
+    bool is_same(const device::ptr other) override {
+        return this == other.get();  // identity comparison: same object address
+    }
+
+    ~dummy_device() = default;
+
+private:
+    device_info _info;
+    memory_capabilities _mem_caps;
+};
+
+}  // namespace
+
+TEST(devices_test, sort_order_single_vendor) {
+    size_t device_id = 0;
+    std::vector<device::ptr> devices_list;
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::integrated_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
+
+    auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);
+    // The single iGPU (id 2) must come first; the dGPUs tie on priority and keep their insertion order.
+    std::vector<size_t> expected_devices_order = {2, 0, 1, 3, 4};
+
+    std::vector<size_t> actual_devices_order;
+    std::transform(sorted_list.begin(), sorted_list.end(), std::back_inserter(actual_devices_order), [](const device::ptr& d) -> size_t {
+        return d->get_info().device_id;
+    });
+
+    ASSERT_EQ(expected_devices_order, actual_devices_order);
+}
+
+TEST(devices_test, sort_order_two_vendors) {
+    size_t device_id = 0;
+    const auto OTHER_VENDOR_ID = 0x123;
+    std::vector<device::ptr> devices_list;
+    devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::integrated_gpu, device_id++));
+
+    auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);
+    // Intel iGPU (id 3) first, then Intel dGPU (id 2), then the other-vendor devices in insertion order.
+    std::vector<size_t> expected_devices_order = {3, 2, 0, 1};
+
+    std::vector<size_t> actual_devices_order;
+    std::transform(sorted_list.begin(), sorted_list.end(), std::back_inserter(actual_devices_order), [](const device::ptr& d) -> size_t {
+        return d->get_info().device_id;
+    });
+
+    ASSERT_EQ(expected_devices_order, actual_devices_order);
+}
+
+TEST(devices_test, sort_order_three_vendors) {
+    size_t device_id = 0;
+    const auto OTHER_VENDOR_ID1 = 0x123;
+    const auto OTHER_VENDOR_ID2 = 0x1234;
+    std::vector<device::ptr> devices_list;
+    devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID1, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID1, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::integrated_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++));
+    devices_list.push_back(std::make_shared<dummy_device>(OTHER_VENDOR_ID2, device_type::discrete_gpu, device_id++));
+
+    auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);
+    // Intel devices lead (iGPU 2, dGPU 3); both non-Intel vendors share one priority, so 0, 1, 4, 5 keep insertion order.
+    std::vector<size_t> expected_devices_order = {2, 3, 0, 1, 4, 5};
+
+    std::vector<size_t> actual_devices_order;
+    std::transform(sorted_list.begin(), sorted_list.end(), std::back_inserter(actual_devices_order), [](const device::ptr& d) -> size_t {
+        return d->get_info().device_id;
+    });
+
+    ASSERT_EQ(expected_devices_order, actual_devices_order);
+}