[GPU] Baseline for enabling GPUs from other vendors (#12577)
This commit is contained in:
committed by
GitHub
parent
7595fd4c4e
commit
af9724e8da
@@ -6,11 +6,14 @@
|
||||
|
||||
#include "device_info.hpp"
|
||||
#include "memory_caps.hpp"
|
||||
#include "layout.hpp"
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace cldnn {
|
||||
|
||||
const uint32_t INTEL_VENDOR_ID = 0x8086;
|
||||
|
||||
/// @brief Represents detected GPU device object. Use device_query to get list of available objects.
|
||||
struct device {
|
||||
public:
|
||||
@@ -20,6 +23,8 @@ public:
|
||||
|
||||
virtual bool is_same(const device::ptr other) = 0;
|
||||
|
||||
float get_gops(cldnn::data_types dt) const;
|
||||
|
||||
virtual ~device() = default;
|
||||
};
|
||||
|
||||
|
||||
@@ -1548,7 +1548,9 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
|
||||
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
auto& engine = get_engine();
|
||||
if (engine.get_device_info().supports_immad && engine.configuration().queue_type == queue_types::in_order)
|
||||
if (engine.get_device_info().supports_immad &&
|
||||
engine.get_device_info().vendor_id == INTEL_VENDOR_ID &&
|
||||
engine.configuration().queue_type == queue_types::in_order)
|
||||
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -85,25 +85,25 @@ KERNEL(reorder_biplanar_nv12)(
|
||||
B -= VALUE_TO_SUBTRACT[2];
|
||||
#elif defined MEAN_SUBTRACT_IN_BUFFER
|
||||
uint8 msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, 0, w, z, y, x);
|
||||
R -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
|
||||
R -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];
|
||||
|
||||
msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, 1, w, z, y, x);
|
||||
G -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
|
||||
G -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];
|
||||
|
||||
msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, 2, w, z, y, x);
|
||||
B -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
|
||||
B -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];
|
||||
#endif
|
||||
|
||||
uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
|
||||
uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
uint output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(R), NL_M, NL_N);
|
||||
|
||||
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(G), NL_M, NL_N);
|
||||
|
||||
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(B), NL_M, NL_N);
|
||||
|
||||
|
||||
|
||||
@@ -117,7 +117,7 @@ KERNEL (reorder_data)(
|
||||
#else
|
||||
uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, f, w, z, y, x);
|
||||
const uint input_idx = FUNC_CALL(get_input_index)(b, f, w, z, y, x);
|
||||
const uint output_idx = FUNC_CALL(get_output_index)(ov[1],ov[2],ov[3],ov[4], ov[5], ov[6]);
|
||||
const uint output_idx = FUNC_CALL(get_output_index)(ov.s1,ov.s2,ov.s3,ov.s4,ov.s5,ov.s6);
|
||||
|
||||
#if defined MEAN_SUBTRACT_INSIDE_PARAMS
|
||||
float res = TO_MEAN_TYPE(input[input_idx]);
|
||||
@@ -130,7 +130,7 @@ KERNEL (reorder_data)(
|
||||
// TODO Add support for 6D mean
|
||||
MEAN_SUBTRACT_TYPE res = TO_MEAN_TYPE(input[input_idx]);
|
||||
uint8 msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, f, w, z, y, x);
|
||||
res = MEAN_OP(res, mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], /*msv[3], msv[4],*/ msv[5], msv[6])]);
|
||||
res = MEAN_OP(res, mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, /*msv.s3, msv.s4,*/ msv.s5, msv.s6)]);
|
||||
#endif
|
||||
#else
|
||||
CALC_TYPE res = TO_CALC_TYPE(input[input_idx]);
|
||||
@@ -139,27 +139,27 @@ KERNEL (reorder_data)(
|
||||
|
||||
#if defined INPUT0_LAYOUT_NV12
|
||||
uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
|
||||
uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
uint output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(R), NL_M, NL_N);
|
||||
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(G), NL_M, NL_N);
|
||||
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(B), NL_M, NL_N);
|
||||
#elif INPUT0_LAYOUT_IMAGE_2D_RGBA
|
||||
uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
|
||||
uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
uint output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s0), NL_M, NL_N);
|
||||
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s1), NL_M, NL_N);
|
||||
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s2), NL_M, NL_N);
|
||||
#if INPUT0_FEATURE_NUM == 4
|
||||
ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 3, w, z, y, x);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
|
||||
output_idx = FUNC_CALL(get_output_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s3), NL_M, NL_N);
|
||||
#endif
|
||||
#elif OUTPUT_LAYOUT_IMAGE_2D_RGBA
|
||||
|
||||
@@ -229,7 +229,7 @@ KERNEL (reorder_data_fast_b1)(
|
||||
const uint output_idx = data_idx;
|
||||
#else
|
||||
uint8 ov = RESHAPE_DIMS(OUTPUT, INPUT0, b, f, w, z, y, x);
|
||||
const uint input_idx = FUNC_CALL(get_input_index)(ov[1], ov[2], ov[3], ov[4], ov[5],ov[6]);
|
||||
const uint input_idx = FUNC_CALL(get_input_index)(ov.s1, ov.s2, ov.s3, ov.s4, ov.s5, ov.s6);
|
||||
const uint output_idx = FUNC_CALL(get_output_index)(b, f, w, z, y, x);
|
||||
#endif
|
||||
|
||||
@@ -239,7 +239,7 @@ KERNEL (reorder_data_fast_b1)(
|
||||
#elif defined MEAN_SUBTRACT_IN_BUFFER
|
||||
MEAN_SUBTRACT_TYPE res = TO_MEAN_TYPE(input[input_idx]);
|
||||
uint8 msv = RESHAPE_DIMS(INPUT0, MEAN_SUBTRACT, b, f, w, z, y, x);
|
||||
res -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv[1], msv[2], msv[5], msv[6])];
|
||||
res -= mean_subtract[GET_DATA_INDEX_SAFE(MEAN_SUBTRACT, msv.s1, msv.s2, msv.s5, msv.s6)];
|
||||
#else
|
||||
CALC_TYPE res = TO_CALC_TYPE(input[input_idx]);
|
||||
#endif
|
||||
|
||||
@@ -452,7 +452,7 @@ KERNEL (reorder_weights)(const __global INPUT0_TYPE* input, write_only image2d_t
|
||||
MAKE_VECTOR_TYPE(UNIT_TYPE, 4) input_val = (MAKE_VECTOR_TYPE(UNIT_TYPE, 4))(UNIT_VAL_ZERO, UNIT_VAL_ZERO, UNIT_VAL_ZERO, UNIT_VAL_ZERO);
|
||||
const int2 coord = (int2)(o, iyx);
|
||||
uint8 ir = RESHAPE_WEIGHT_DIMS(OUTPUT, INPUT0, o, i, 0, 0, y, x);
|
||||
input_val.s0 = TO_OUTPUT_TYPE(input[FUNC_CALL(get_input_index)(ir[0],ir[1],ir[2],ir[4],ir[5],ir[6])]);
|
||||
input_val.s0 = TO_OUTPUT_TYPE(input[FUNC_CALL(get_input_index)(ir.s0,ir.s1,ir.s2,ir.s4,ir.s5,ir.s6)]);
|
||||
IMAGE_WRITE(output, coord, input_val);
|
||||
}
|
||||
#else
|
||||
@@ -489,7 +489,7 @@ KERNEL (reorder_weights)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE
|
||||
uint8 ir = RESHAPE_WEIGHT_DIMS(OUTPUT, INPUT0, o, i, 0, z, y, x);
|
||||
#endif
|
||||
|
||||
uint input_idx = FUNC_CALL(get_input_index)(ir[0],ir[1],ir[2],ir[4],ir[5],ir[6]);
|
||||
uint input_idx = FUNC_CALL(get_input_index)(ir.s0,ir.s1,ir.s2,ir.s4,ir.s5,ir.s6);
|
||||
#if !REORDER_ROTATE
|
||||
uint output_idx = FUNC_CALL(get_output_index)(g, o, i, z, y, x);
|
||||
#else
|
||||
|
||||
@@ -512,58 +512,6 @@ auto StringRightTrim = [](std::string string, std::string substring, bool case_s
|
||||
return ret_str;
|
||||
};
|
||||
|
||||
// Estimates device GOPS for the requested precision as
//   frequency (GHz) * ops per compute block * compute block IPC * number of EUs.
// Accepts u8, i8, f16 and f32; any other precision throws std::runtime_error.
static float GetGOPS(cldnn::device_info info, cldnn::data_types dt) {
    const bool is_int8 = (dt == cldnn::data_types::u8) || (dt == cldnn::data_types::i8);
    const bool is_fp16 = (dt == cldnn::data_types::f16);
    const bool is_fp32 = (dt == cldnn::data_types::f32);
    if (!is_int8 && !is_fp16 && !is_fp32)
        throw std::runtime_error("GetGOPS: Unsupported precision");

    // With immad support a value is only provided for gfx 12.5 and 12.7;
    // every other gfx version leaves the op count at 0.
    const bool gfx_12_5 = info.gfx_ver.major == 12 && info.gfx_ver.minor == 5;
    const bool gfx_12_7 = info.gfx_ver.major == 12 && info.gfx_ver.minor == 7;

    int blockOps = 0;
    float blockIPC = 1.0f;

    if (is_int8) {
        if (info.supports_immad) {
            if (gfx_12_5)
                blockOps = 512;
            else if (gfx_12_7)
                blockOps = 256;
        } else if (info.supports_imad) {
            // fma * simd size
            blockOps = 2 * 32;
        } else {
            // separate mul + add instructions for int8 data type
            blockOps = 2 * 16;
            // mul/add instructions can't be executed in parallel, so we need 2 clocks to execute compute block
            blockIPC = 0.5f;
        }
    } else if (is_fp16) {
        if (info.supports_immad) {
            if (gfx_12_5)
                blockOps = 256;
            else if (gfx_12_7)
                blockOps = 128;
        } else {
            // fma * simd size
            blockOps = 2 * 16;
        }
    } else {
        // f32: fma * simd size
        blockOps = 2 * 8;
    }

    const auto frequencyGHz = info.gpu_frequency / 1000.f;
    const auto euCount = info.execution_units_count;
    return frequencyGHz * blockOps * blockIPC * euCount;
}
|
||||
|
||||
Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string, Parameter>& options) const {
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::GetMetric");
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
@@ -648,17 +596,17 @@ Parameter Plugin::GetMetric(const std::string& name, const std::map<std::string,
|
||||
} else if (name == ov::device::gops) {
|
||||
if (is_new_api) {
|
||||
std::map<element::Type, float> gops;
|
||||
gops[element::i8] = GetGOPS(device_info, cldnn::data_types::i8);
|
||||
gops[element::u8] = GetGOPS(device_info, cldnn::data_types::u8);
|
||||
gops[element::f16] = GetGOPS(device_info, cldnn::data_types::f16);
|
||||
gops[element::f32] = GetGOPS(device_info, cldnn::data_types::f32);
|
||||
gops[element::i8] = device->get_gops(cldnn::data_types::i8);
|
||||
gops[element::u8] = device->get_gops(cldnn::data_types::u8);
|
||||
gops[element::f16] = device->get_gops(cldnn::data_types::f16);
|
||||
gops[element::f32] = device->get_gops(cldnn::data_types::f32);
|
||||
return decltype(ov::device::gops)::value_type {gops};
|
||||
} else {
|
||||
std::map<InferenceEngine::Precision, float> gops;
|
||||
gops[InferenceEngine::Precision::I8] = GetGOPS(device_info, cldnn::data_types::i8);
|
||||
gops[InferenceEngine::Precision::U8] = GetGOPS(device_info, cldnn::data_types::u8);
|
||||
gops[InferenceEngine::Precision::FP16] = GetGOPS(device_info, cldnn::data_types::f16);
|
||||
gops[InferenceEngine::Precision::FP32] = GetGOPS(device_info, cldnn::data_types::f32);
|
||||
gops[InferenceEngine::Precision::I8] = device->get_gops(cldnn::data_types::i8);
|
||||
gops[InferenceEngine::Precision::U8] = device->get_gops(cldnn::data_types::u8);
|
||||
gops[InferenceEngine::Precision::FP16] = device->get_gops(cldnn::data_types::f16);
|
||||
gops[InferenceEngine::Precision::FP32] = device->get_gops(cldnn::data_types::f32);
|
||||
IE_SET_METRIC_RETURN(DEVICE_GOPS, gops);
|
||||
}
|
||||
} else if (name == ov::intel_gpu::execution_units_count) {
|
||||
|
||||
66
src/plugins/intel_gpu/src/runtime/device.cpp
Normal file
66
src/plugins/intel_gpu/src/runtime/device.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "intel_gpu/runtime/device.hpp"
|
||||
|
||||
namespace cldnn {
|
||||
|
||||
/// @brief Estimates device GOPS for the requested precision as
///        frequency (GHz) * ops per compute block * compute block IPC * number of EUs.
/// @param dt Precision to estimate for; u8, i8, f16 and f32 are handled.
/// @return The estimate, or 0 when the device vendor is not Intel.
/// @note Any other precision fails OPENVINO_ASSERT.
float device::get_gops(cldnn::data_types dt) const {
    const auto dev_info = get_info();
    if (dev_info.vendor_id != INTEL_VENDOR_ID) {
        // GOPS calculation is not supported for non Intel GPUs
        return 0.0f;
    }

    // Selects the op count for immad-capable devices by gfx version.
    // Only 12.5 and 12.7 have a value; anything else yields 0.
    const auto immad_ops = [&dev_info](int ops_gfx_12_5, int ops_gfx_12_7) -> int {
        if (dev_info.gfx_ver.major != 12)
            return 0;
        if (dev_info.gfx_ver.minor == 5)
            return ops_gfx_12_5;
        if (dev_info.gfx_ver.minor == 7)
            return ops_gfx_12_7;
        return 0;
    };

    int ops_per_block = 0;
    float block_ipc = 1.0f;

    if (dt == cldnn::data_types::u8 || dt == cldnn::data_types::i8) {
        if (dev_info.supports_immad) {
            ops_per_block = immad_ops(512, 256);
        } else if (dev_info.supports_imad) {
            // fma * simd size
            ops_per_block = 2 * 32;
        } else {
            // separate mul + add instructions for int8 data type
            ops_per_block = 2 * 16;
            // mul/add instructions can't be executed in parallel, so we need 2 clocks to execute compute block
            block_ipc = 0.5f;
        }
    } else if (dt == cldnn::data_types::f16) {
        if (dev_info.supports_immad) {
            ops_per_block = immad_ops(256, 128);
        } else {
            // fma * simd size
            ops_per_block = 2 * 16;
        }
    } else if (dt == cldnn::data_types::f32) {
        // fma * simd size
        ops_per_block = 2 * 8;
    } else {
        OPENVINO_ASSERT(false, "[GPU] get_gops: unsupported precision: ", dt);
    }

    const auto freq_ghz = dev_info.gpu_frequency / 1000.f;
    const auto eu_count = dev_info.execution_units_count;
    return freq_ghz * ops_per_block * block_ipc * eu_count;
}
|
||||
|
||||
} // namespace cldnn
|
||||
@@ -22,25 +22,43 @@
|
||||
|
||||
namespace {
|
||||
bool does_device_match_config(bool out_of_order, const cl::Device& device) {
|
||||
// Is it intel gpu
|
||||
if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU ||
|
||||
device.getInfo<CL_DEVICE_VENDOR_ID>() != 0x8086) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Does device support OOOQ?
|
||||
if (out_of_order) {
|
||||
auto queue_properties = device.getInfo<CL_DEVICE_QUEUE_PROPERTIES>();
|
||||
using cmp_t = std::common_type<decltype(queue_properties),
|
||||
typename std::underlying_type<cl::QueueProperties>::type>::type;
|
||||
if (!(static_cast<cmp_t>(queue_properties) & static_cast<cmp_t>(cl::QueueProperties::OutOfOrder))) {
|
||||
if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: Remove the check below once kernels are fixed
|
||||
if (device.getInfo<CL_DEVICE_VENDOR_ID>() != cldnn::INTEL_VENDOR_ID)
|
||||
return false;
|
||||
|
||||
// Does device support OOOQ?
|
||||
if (out_of_order) {
|
||||
auto queue_properties = device.getInfo<CL_DEVICE_QUEUE_PROPERTIES>();
|
||||
using cmp_t = std::common_type<decltype(queue_properties),
|
||||
typename std::underlying_type<cl::QueueProperties>::type>::type;
|
||||
if (!(static_cast<cmp_t>(queue_properties) & static_cast<cmp_t>(cl::QueueProperties::OutOfOrder))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return true;
|
||||
// The priority return by this function impacts the order of devices reported by GPU plugin and devices enumeration
|
||||
// Lower priority value means lower device ID
|
||||
// Current behavior is: Intel iGPU < Intel dGPU < any other GPU
|
||||
// Order of Intel dGPUs is undefined and depends on the OCL impl
|
||||
// Order of other vendor GPUs is undefined and depends on the OCL impl
|
||||
size_t get_device_priority(const cldnn::device_info& info) {
|
||||
if (info.vendor_id == cldnn::INTEL_VENDOR_ID && info.dev_type == cldnn::device_type::integrated_gpu) {
|
||||
return 0;
|
||||
} else if (info.vendor_id == cldnn::INTEL_VENDOR_ID) {
|
||||
return 1;
|
||||
} else {
|
||||
return std::numeric_limits<size_t>::max();
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace cldnn {
|
||||
namespace ocl {
|
||||
static constexpr auto INTEL_PLATFORM_VENDOR = "Intel(R) Corporation";
|
||||
@@ -56,10 +74,8 @@ static std::vector<cl::Device> getSubDevices(cl::Device& rootDevice) {
|
||||
sizeof(maxSubDevices),
|
||||
&maxSubDevices, &maxSubDevicesSize);
|
||||
|
||||
if (err != CL_SUCCESS || maxSubDevicesSize != sizeof(maxSubDevices)) {
|
||||
throw cl::Error(err, "clGetDeviceInfo(..., CL_DEVICE_PARTITION_MAX_SUB_DEVICES,...)");
|
||||
}
|
||||
|
||||
OPENVINO_ASSERT(err == CL_SUCCESS && maxSubDevicesSize == sizeof(maxSubDevices),
|
||||
"[GPU] clGetDeviceInfo(..., CL_DEVICE_PARTITION_MAX_SUB_DEVICES,...)");
|
||||
if (maxSubDevices == 0) {
|
||||
return {};
|
||||
}
|
||||
@@ -91,47 +107,50 @@ static std::vector<cl::Device> getSubDevices(cl::Device& rootDevice) {
|
||||
return subDevices;
|
||||
}
|
||||
|
||||
/// @brief Returns a copy of @p devices_list ordered by ascending get_device_priority().
/// @param devices_list Devices to order; the input vector is left untouched.
/// @return New vector holding the same device pointers in priority order. The sort is stable,
///         so devices with equal priority keep their relative order from @p devices_list.
std::vector<device::ptr> ocl_device_detector::sort_devices(const std::vector<device::ptr>& devices_list) {
    std::vector<device::ptr> sorted_list = devices_list;
    // The comparator only reads through the pointers, so take them by const reference:
    // passing the shared pointers by value would bump and drop the reference count on every comparison.
    std::stable_sort(sorted_list.begin(), sorted_list.end(), [](const device::ptr& d1, const device::ptr& d2) {
        return get_device_priority(d1->get_info()) < get_device_priority(d2->get_info());
    });

    return sorted_list;
}
|
||||
|
||||
std::map<std::string, device::ptr> ocl_device_detector::get_available_devices(void* user_context,
|
||||
void* user_device,
|
||||
int ctx_device_id,
|
||||
int target_tile_id) const {
|
||||
bool host_out_of_order = true; // Change to false, if debug requires in-order queue.
|
||||
std::vector<device::ptr> dev_orig, dev_sorted;
|
||||
std::vector<device::ptr> devices_list;
|
||||
if (user_context != nullptr) {
|
||||
dev_orig = create_device_list_from_user_context(host_out_of_order, user_context, ctx_device_id);
|
||||
devices_list = create_device_list_from_user_context(host_out_of_order, user_context, ctx_device_id);
|
||||
} else if (user_device != nullptr) {
|
||||
dev_orig = create_device_list_from_user_device(host_out_of_order, user_device);
|
||||
devices_list = create_device_list_from_user_device(host_out_of_order, user_device);
|
||||
} else {
|
||||
dev_orig = create_device_list(host_out_of_order);
|
||||
devices_list = create_device_list(host_out_of_order);
|
||||
}
|
||||
|
||||
devices_list = sort_devices(devices_list);
|
||||
|
||||
std::map<std::string, device::ptr> ret;
|
||||
for (auto& dptr : dev_orig) {
|
||||
if (dptr->get_info().dev_type == cldnn::device_type::integrated_gpu)
|
||||
dev_sorted.insert(dev_sorted.begin(), dptr);
|
||||
else
|
||||
dev_sorted.push_back(dptr);
|
||||
}
|
||||
uint32_t idx = 0;
|
||||
for (auto& dptr : dev_sorted) {
|
||||
for (auto& dptr : devices_list) {
|
||||
auto map_id = std::to_string(idx++);
|
||||
ret[map_id] = dptr;
|
||||
|
||||
auto rootDevice = std::dynamic_pointer_cast<ocl_device>(dptr);
|
||||
if (!rootDevice) {
|
||||
throw std::runtime_error("Invalid device type created in ocl_device_detector");
|
||||
}
|
||||
auto root_device = std::dynamic_pointer_cast<ocl_device>(dptr);
|
||||
OPENVINO_ASSERT(root_device != nullptr, "[GPU] Invalid device type created in ocl_device_detector");
|
||||
|
||||
auto subDevices = getSubDevices(rootDevice->get_device());
|
||||
if (!subDevices.empty()) {
|
||||
auto sub_devices = getSubDevices(root_device->get_device());
|
||||
if (!sub_devices.empty()) {
|
||||
uint32_t sub_idx = 0;
|
||||
for (auto& subdevice : subDevices) {
|
||||
for (auto& sub_device : sub_devices) {
|
||||
if (target_tile_id != -1 && static_cast<int>(sub_idx) != target_tile_id) {
|
||||
sub_idx++;
|
||||
continue;
|
||||
}
|
||||
auto subdPtr = std::make_shared<ocl_device>(subdevice, cl::Context(subdevice), rootDevice->get_platform());
|
||||
ret[map_id+"."+std::to_string(sub_idx++)] = subdPtr;
|
||||
auto sub_device_ptr = std::make_shared<ocl_device>(sub_device, cl::Context(sub_device), root_device->get_platform());
|
||||
ret[map_id + "." + std::to_string(sub_idx++)] = sub_device_ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -142,72 +161,56 @@ std::vector<device::ptr> ocl_device_detector::create_device_list(bool out_out_or
|
||||
cl_uint n = 0;
|
||||
// Get number of platforms available
|
||||
cl_int err = clGetPlatformIDs(0, NULL, &n);
|
||||
if (err != CL_SUCCESS) {
|
||||
throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
|
||||
}
|
||||
|
||||
OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ", err);
|
||||
// Get platform list
|
||||
std::vector<cl_platform_id> platform_ids(n);
|
||||
err = clGetPlatformIDs(n, platform_ids.data(), NULL);
|
||||
if (err != CL_SUCCESS) {
|
||||
throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
|
||||
}
|
||||
OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ", err);
|
||||
|
||||
std::vector<device::ptr> ret;
|
||||
std::vector<device::ptr> supported_devices;
|
||||
for (auto& id : platform_ids) {
|
||||
cl::Platform platform = cl::Platform(id);
|
||||
|
||||
if (platform.getInfo<CL_PLATFORM_VENDOR>() != INTEL_PLATFORM_VENDOR)
|
||||
continue;
|
||||
|
||||
std::vector<cl::Device> devices;
|
||||
platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
|
||||
for (auto& device : devices) {
|
||||
if (!does_device_match_config(out_out_order, device))
|
||||
continue;
|
||||
ret.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device), id));
|
||||
supported_devices.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device), id));
|
||||
}
|
||||
}
|
||||
if (ret.empty()) {
|
||||
throw std::runtime_error("[CLDNN ERROR]. No GPU device was found.");
|
||||
}
|
||||
return ret;
|
||||
OPENVINO_ASSERT(!supported_devices.empty(), "[GPU] No GPU device was found.");
|
||||
return supported_devices;
|
||||
}
|
||||
|
||||
std::vector<device::ptr> ocl_device_detector::create_device_list_from_user_context(bool out_out_order, void* user_context, int ctx_device_id) const {
|
||||
cl::Context ctx = cl::Context(static_cast<cl_context>(user_context), true);
|
||||
auto all_devices = ctx.getInfo<CL_CONTEXT_DEVICES>();
|
||||
|
||||
std::vector<device::ptr> ret;
|
||||
std::vector<device::ptr> supported_devices;
|
||||
for (size_t i = 0; i < all_devices.size(); i++) {
|
||||
auto& device = all_devices[i];
|
||||
if (!does_device_match_config(out_out_order, device) || static_cast<int>(i) != ctx_device_id)
|
||||
continue;
|
||||
ret.emplace_back(std::make_shared<ocl_device>(device, ctx, device.getInfo<CL_DEVICE_PLATFORM>()));
|
||||
supported_devices.emplace_back(std::make_shared<ocl_device>(device, ctx, device.getInfo<CL_DEVICE_PLATFORM>()));
|
||||
}
|
||||
|
||||
if (ret.empty()) {
|
||||
throw std::runtime_error("[CLDNN ERROR]. User defined context does not have GPU device included!");
|
||||
}
|
||||
return ret;
|
||||
OPENVINO_ASSERT(!supported_devices.empty(), "[GPU] User defined context does not have GPU device included.");
|
||||
return supported_devices;
|
||||
}
|
||||
|
||||
std::vector<device::ptr> ocl_device_detector::create_device_list_from_user_device(bool out_out_order, void* user_device) const {
|
||||
std::vector<device::ptr> ocl_device_detector::create_device_list_from_user_device(bool out_out_order, void* user_device) const {
|
||||
cl_uint n = 0;
|
||||
// Get number of platforms available
|
||||
cl_int err = clGetPlatformIDs(0, NULL, &n);
|
||||
if (err != CL_SUCCESS) {
|
||||
throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
|
||||
}
|
||||
OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ", err);
|
||||
|
||||
// Get platform list
|
||||
std::vector<cl_platform_id> platform_ids(n);
|
||||
err = clGetPlatformIDs(n, platform_ids.data(), NULL);
|
||||
if (err != CL_SUCCESS) {
|
||||
throw std::runtime_error("[CLDNN ERROR]. clGetPlatformIDs error " + std::to_string(err));
|
||||
}
|
||||
OPENVINO_ASSERT(err == CL_SUCCESS, "[GPU] clGetPlatformIDs error ", err);
|
||||
|
||||
std::vector<device::ptr> ret;
|
||||
std::vector<device::ptr> supported_devices;
|
||||
for (auto& id : platform_ids) {
|
||||
cl::PlatformVA platform = cl::PlatformVA(id);
|
||||
|
||||
@@ -250,13 +253,11 @@ std::vector<device::ptr> ocl_device_detector::create_device_list_from_user_devi
|
||||
CL_CONTEXT_INTEROP_USER_SYNC, CL_FALSE,
|
||||
CL_CONTEXT_PLATFORM, (cl_context_properties)id,
|
||||
0 };
|
||||
ret.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device, props), id));
|
||||
supported_devices.emplace_back(std::make_shared<ocl_device>(device, cl::Context(device, props), id));
|
||||
}
|
||||
}
|
||||
if (ret.empty()) {
|
||||
throw std::runtime_error("[CLDNN ERROR]. No corresponding GPU device was found.");
|
||||
}
|
||||
return ret;
|
||||
OPENVINO_ASSERT(!supported_devices.empty(), "[GPU] User specified device is not supported.");
|
||||
return supported_devices;
|
||||
}
|
||||
|
||||
} // namespace ocl
|
||||
|
||||
@@ -21,6 +21,8 @@ public:
|
||||
|
||||
std::map<std::string, device::ptr> get_available_devices(void *user_context, void *user_device, int ctx_device_id = 0, int target_tile_id = -1) const;
|
||||
|
||||
static std::vector<device::ptr> sort_devices(const std::vector<device::ptr>& devices_list);
|
||||
|
||||
private:
|
||||
std::vector<device::ptr> create_device_list(bool out_out_order) const;
|
||||
std::vector<device::ptr> create_device_list_from_user_context(bool out_out_order, void* user_context, int ctx_device_id = 0) const;
|
||||
|
||||
@@ -57,6 +57,7 @@ ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type,
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
dnnl::engine& ocl_engine::get_onednn_engine() const {
|
||||
const std::lock_guard<std::mutex> lock(onednn_mutex);
|
||||
OPENVINO_ASSERT(_device->get_info().vendor_id == INTEL_VENDOR_ID, "[GPU] OneDNN engine can be used for Intel GPUs only");
|
||||
if (!_onednn_engine) {
|
||||
auto casted = std::dynamic_pointer_cast<ocl_device>(_device);
|
||||
if (!casted)
|
||||
|
||||
@@ -291,7 +291,7 @@ ocl_stream::ocl_stream(const ocl_engine &engine)
|
||||
_command_queue = queue_builder.build(context, device);
|
||||
|
||||
#ifdef ENABLE_ONEDNN_FOR_GPU
|
||||
if (config.queue_type == queue_types::in_order) {
|
||||
if (config.queue_type == queue_types::in_order && engine.get_device_info().vendor_id == INTEL_VENDOR_ID) {
|
||||
auto onednn_engine = engine.get_onednn_engine();
|
||||
_onednn_stream = std::make_shared<dnnl::stream>(dnnl::ocl_interop::make_stream(engine.get_onednn_engine(), _command_queue.get()));
|
||||
}
|
||||
|
||||
103
src/plugins/intel_gpu/tests/module_tests/device_test.cpp
Normal file
103
src/plugins/intel_gpu/tests/module_tests/device_test.cpp
Normal file
@@ -0,0 +1,103 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "test_utils.h"
|
||||
#include "intel_gpu/runtime/device.hpp"
|
||||
#include "runtime/ocl/ocl_device_detector.hpp"
|
||||
#include <memory>
|
||||
|
||||
using namespace cldnn;
|
||||
using namespace ::tests;
|
||||
|
||||
namespace {
|
||||
|
||||
struct dummy_device : public device {
|
||||
public:
|
||||
dummy_device(uint32_t vendor_id, device_type type, size_t device_id) : _mem_caps({}) {
|
||||
_info = device_info{};
|
||||
_info.vendor_id = vendor_id;
|
||||
_info.dev_type = type;
|
||||
_info.device_id = device_id;
|
||||
}
|
||||
|
||||
device_info get_info() const override { return _info; }
|
||||
memory_capabilities get_mem_caps() const override { return _mem_caps; }
|
||||
bool is_same(const device::ptr other) override {
|
||||
return this == other.get();
|
||||
}
|
||||
|
||||
~dummy_device() = default;
|
||||
|
||||
private:
|
||||
device_info _info;
|
||||
memory_capabilities _mem_caps;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Five Intel devices with the only iGPU at input position 2:
// the iGPU must come first and the dGPUs must keep their input order.
TEST(devices_test, sort_order_single_vendor) {
    const std::vector<device_type> input_types = {
        device_type::discrete_gpu,
        device_type::discrete_gpu,
        device_type::integrated_gpu,
        device_type::discrete_gpu,
        device_type::discrete_gpu,
    };

    // The device id of every dummy device is its position in the input list.
    std::vector<device::ptr> devices_list;
    for (size_t id = 0; id < input_types.size(); ++id) {
        devices_list.push_back(std::make_shared<dummy_device>(INTEL_VENDOR_ID, input_types[id], id));
    }

    const auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);

    std::vector<size_t> actual_devices_order;
    for (const auto& dev : sorted_list) {
        actual_devices_order.push_back(dev->get_info().device_id);
    }

    const std::vector<size_t> expected_devices_order = {2, 0, 1, 3, 4};
    ASSERT_EQ(expected_devices_order, actual_devices_order);
}
|
||||
|
||||
// Two devices of another vendor listed ahead of an Intel dGPU and an Intel iGPU:
// expected result is Intel iGPU, Intel dGPU, then the other vendor's devices in input order.
TEST(devices_test, sort_order_two_vendors) {
    const uint32_t OTHER_VENDOR_ID = 0x123;
    const std::vector<std::pair<uint32_t, device_type>> input_devices = {
        {OTHER_VENDOR_ID, device_type::discrete_gpu},
        {OTHER_VENDOR_ID, device_type::discrete_gpu},
        {INTEL_VENDOR_ID, device_type::discrete_gpu},
        {INTEL_VENDOR_ID, device_type::integrated_gpu},
    };

    // The device id of every dummy device is its position in the input list.
    std::vector<device::ptr> devices_list;
    for (size_t id = 0; id < input_devices.size(); ++id) {
        devices_list.push_back(std::make_shared<dummy_device>(input_devices[id].first, input_devices[id].second, id));
    }

    const auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);

    std::vector<size_t> actual_devices_order;
    for (const auto& dev : sorted_list) {
        actual_devices_order.push_back(dev->get_info().device_id);
    }

    const std::vector<size_t> expected_devices_order = {3, 2, 0, 1};
    ASSERT_EQ(expected_devices_order, actual_devices_order);
}
|
||||
|
||||
// Intel devices placed between devices of two other vendors:
// expected result is Intel iGPU, Intel dGPU, then all non-Intel devices in their input order.
TEST(devices_test, sort_order_three_vendors) {
    const uint32_t OTHER_VENDOR_ID1 = 0x123;
    const uint32_t OTHER_VENDOR_ID2 = 0x1234;
    const std::vector<std::pair<uint32_t, device_type>> input_devices = {
        {OTHER_VENDOR_ID1, device_type::discrete_gpu},
        {OTHER_VENDOR_ID1, device_type::discrete_gpu},
        {INTEL_VENDOR_ID, device_type::integrated_gpu},
        {INTEL_VENDOR_ID, device_type::discrete_gpu},
        {OTHER_VENDOR_ID2, device_type::discrete_gpu},
        {OTHER_VENDOR_ID2, device_type::discrete_gpu},
    };

    // The device id of every dummy device is its position in the input list.
    std::vector<device::ptr> devices_list;
    for (size_t id = 0; id < input_devices.size(); ++id) {
        devices_list.push_back(std::make_shared<dummy_device>(input_devices[id].first, input_devices[id].second, id));
    }

    const auto sorted_list = ocl::ocl_device_detector::sort_devices(devices_list);

    std::vector<size_t> actual_devices_order;
    for (const auto& dev : sorted_list) {
        actual_devices_order.push_back(dev->get_info().device_id);
    }

    const std::vector<size_t> expected_devices_order = {2, 3, 0, 1, 4, 5};
    ASSERT_EQ(expected_devices_order, actual_devices_order);
}
|
||||
Reference in New Issue
Block a user