// Copyright (C) 2018-2022 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "remote_tensors_filling.hpp" #include #include #include #include #include #include #ifdef HAVE_DEVICE_MEM_SUPPORT # include # include #endif namespace gpu { template using uniformDistribution = typename std::conditional< std::is_floating_point::value, std::uniform_real_distribution, typename std::conditional::value, std::uniform_int_distribution, void>::type>::type; template void fill_buffer_random(void* inputBuffer, size_t elementsNum, T rand_min = std::numeric_limits::min(), T rand_max = std::numeric_limits::max()) { std::mt19937 gen(0); uniformDistribution distribution(rand_min, rand_max); auto inputBufferData = static_cast(inputBuffer); for (size_t i = 0; i < elementsNum; i++) { inputBufferData[i] = static_cast(distribution(gen)); } } void fill_buffer(void* inputBuffer, size_t elementsNum, const ov::element::Type& type) { if (type == ov::element::f32) { fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::f16) { fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::i32) { fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::i64) { fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::u8) { // uniform_int_distribution is not allowed in the C++17 // standard and vs2017/19 fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::i8) { // uniform_int_distribution is not allowed in the C++17 standard // and vs2017/19 fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::u16) { fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::i16) { fill_buffer_random(inputBuffer, elementsNum); } else if (type == ov::element::boolean) { fill_buffer_random(inputBuffer, elementsNum, 0, 1); } else { throw ov::Exception("Requested type is not supported"); } } std::map get_remote_input_tensors( const std::map>& inputFiles, const std::vector& app_inputs_info, const ov::CompiledModel& compiledModel, std::vector& clBuffer, size_t num_requests) { #ifdef HAVE_DEVICE_MEM_SUPPORT slog::info << "Device memory will be used for input and output blobs" << slog::endl; if (inputFiles.size()) { slog::warn << "Device memory supports only random data at this moment, input images will be ignored" << slog::endl; } std::map remoteTensors; auto context = compiledModel.get_context(); auto& oclContext = static_cast(context); auto oclInstance = std::make_shared(oclContext.get()); for (int i = 0; i < num_requests; i++) { for (auto& inputs_info : app_inputs_info) { for (auto& input : inputs_info) { // Fill random slog::info << "Prepare remote blob for input '" << input.first << "' with random values (" << std::string((input.second.is_image() ? "image" : "some binary data")) << " is expected)" << slog::endl; // Creating and filling shared buffers cl_int err; auto elementsNum = std::accumulate(begin(input.second.dataShape), end(input.second.dataShape), 1, std::multiplies()); auto inputSize = elementsNum * input.second.type.bitwidth() / 8; clBuffer.push_back( cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err)); void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(), CL_TRUE, CL_MEM_READ_WRITE, 0, (cl::size_type)inputSize); auto tensor = oclContext.create_tensor(input.second.type, input.second.dataShape, clBuffer.back().get()); remoteTensors[input.first].push_back(tensor); if (inputFiles.empty()) { // Filling in random data fill_buffer(mappedPtr, elementsNum, input.second.type); } else { // TODO: add filling with real image data } oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr); } } } return remoteTensors; #else throw ov::Exception("Device memory requested for GPU device, but OpenCL was not linked"); #endif } ov::Shape get_static_shape(const ov::Output& compiled_output) { // FIXME: this is a WA for case when original model has internal dynamism (NonMaxSuppression) // and runtime has static output due to conversions to legacy op and lack of dynamism support // to be removed along with dynamism support const auto& compiled_pshape = compiled_output.get_partial_shape(); if (compiled_pshape.is_static()) return compiled_pshape.to_shape(); else if (compiled_pshape.rank().is_dynamic()) OPENVINO_UNREACHABLE( "Benchmark App - NOT IMPLEMENTED - Output of dynamic rank is not supported for remote tensor. ", "Output: ", compiled_output); ov::Shape shape; for (const auto& dimension : compiled_pshape) { if (dimension.get_interval().has_upper_bound()) shape.push_back(static_cast(dimension.get_max_length())); else OPENVINO_UNREACHABLE("Benchmark App - NOT IMPLEMENTED - Fully dynamic output dimensions are not supported " "for remote tensor. ", "Output: ", compiled_output); } return shape; } std::map get_remote_output_tensors(const ov::CompiledModel& compiledModel, std::map& clBuffer) { #ifdef HAVE_DEVICE_MEM_SUPPORT std::map outputTensors; std::shared_ptr runtime_model = nullptr; for (auto& output : compiledModel.outputs()) { auto context = compiledModel.get_context(); auto& oclContext = static_cast(context); auto oclInstance = std::make_shared(oclContext.get()); ov::Shape shape = get_static_shape(output); cl_int err; auto elementsNum = shape_size(shape); auto inputSize = elementsNum * output.get_element_type().bitwidth() / 8; cl::size_type bufferSize = 0; if (clBuffer.find(output.get_any_name()) == clBuffer.end()) { clBuffer[output.get_any_name()] = cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err); } else { auto& buff = clBuffer[output.get_any_name()]; buff.getInfo(CL_MEM_SIZE, &bufferSize); if (inputSize != bufferSize) { buff = cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err); } } outputTensors[output.get_any_name()] = oclContext.create_tensor(output.get_element_type(), shape, clBuffer[output.get_any_name()].get()); } return outputTensors; #else throw ov::Exception("Device memory requested for GPU device, but OpenCL was not linked"); #endif } } // namespace gpu