Add use_device_mem option to benchmark_app (#7433)

Sergey Shlyapnikov 2021-09-17 11:04:50 +03:00 committed by GitHub
parent 8690e14a5b
commit 1f85d4230d
7 changed files with 313 additions and 3 deletions


@@ -2,11 +2,46 @@
# SPDX-License-Identifier: Apache-2.0
#
set(TARGET_NAME "benchmark_app")
file (GLOB SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file (GLOB HDR ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)
ie_add_sample(NAME benchmark_app
ie_add_sample(NAME ${TARGET_NAME}
SOURCES ${SRC}
HEADERS ${HDR}
DEPENDENCIES format_reader ie_samples_utils
OPENCV_DEPENDENCIES core)
find_package(OpenCL)
find_path(OpenCL_HPP_INCLUDE_DIR
NAMES
CL/cl2.hpp OpenCL/cl2.hpp
HINTS
${opencl_root_hints}
ENV "PROGRAMFILES(X86)"
ENV AMDAPPSDKROOT
ENV INTELOCLSDKROOT
ENV NVSDKCOMPUTE_ROOT
ENV CUDA_PATH
ENV ATISTREAMSDKROOT
ENV OCL_ROOT
PATH_SUFFIXES
include
OpenCL/common/inc
"AMD APP/include")
if(OPENCL_HEADERS_DIR)
# Use OpenCL CPP headers from sources if present
set(OpenCL_HEADERS ${OPENCL_HEADERS_DIR})
elseif(OpenCL_HPP_INCLUDE_DIR)
# Append OpenCL CPP headers to C headers and use both
set(OpenCL_HEADERS ${OpenCL_INCLUDE_DIR} ${OpenCL_HPP_INCLUDE_DIR})
endif()
if(OpenCL_FOUND AND OpenCL_HEADERS)
target_link_libraries(${TARGET_NAME} PRIVATE OpenCL::OpenCL)
target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_HEADERS})
target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_GPU_DEVICE_MEM_SUPPORT)
endif()


@@ -4,6 +4,10 @@
#pragma once
#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT)
# define HAVE_DEVICE_MEM_SUPPORT
#endif
#include <gflags/gflags.h>
#include <iostream>
@@ -132,6 +136,12 @@ static const char progress_message[] =
// @brief message for performance counters option
static const char pc_message[] = "Optional. Report performance counters.";
#ifdef HAVE_DEVICE_MEM_SUPPORT
// @brief message for switching memory allocation type option
static const char use_device_mem_message[] =
"Optional. Switch between host and device memory allocation for input and output buffers.";
#endif
#ifdef USE_OPENCV
// @brief message for load config option
static const char load_config_message[] =
@@ -266,6 +276,11 @@ DEFINE_bool(progress, false, progress_message);
/// @brief Define flag for showing performance counters <br>
DEFINE_bool(pc, false, pc_message);
#ifdef HAVE_DEVICE_MEM_SUPPORT
/// @brief Define flag for switching between host and device memory allocation for input and output buffers
DEFINE_bool(use_device_mem, false, use_device_mem_message);
#endif
#ifdef USE_OPENCV
/// @brief Define flag for loading configuration file <br>
DEFINE_string(load_config, "", load_config_message);
@@ -339,6 +354,9 @@ static void showUsage() {
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
std::cout << " -enforcebf16=<true/false> " << enforce_bf16_message << std::endl;
std::cout << " -pin \"YES\"/\"HYBRID_AWARE\"/\"NO\"/\"NUMA\" " << infer_threads_pinning_message << std::endl;
#ifdef HAVE_DEVICE_MEM_SUPPORT
std::cout << " -use_device_mem " << use_device_mem_message << std::endl;
#endif
std::cout << std::endl << " Statistics dumping options:" << std::endl;
std::cout << " -report_type \"<type>\" " << report_type_message << std::endl;
std::cout << " -report_folder " << report_folder_message << std::endl;


@@ -65,6 +65,10 @@ public:
return _request.GetBlob(name);
}
void setBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) {
_request.SetBlob(name, data);
}
double getExecutionTimeInMilliseconds() const {
auto execTime = std::chrono::duration_cast<ns>(_endTime - _startTime);
return static_cast<double>(execTime.count()) * 0.000001;


@@ -21,6 +21,7 @@
#include "infer_request_wrap.hpp"
#include "inputs_filling.hpp"
#include "progress_bar.hpp"
#include "remote_blobs_filling.hpp"
#include "statistics_report.hpp"
#include "utils.hpp"
@@ -592,7 +593,16 @@ int main(int argc, char* argv[]) {
next_step();
InferRequestsQueue inferRequestsQueue(exeNetwork, nireq);
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
if (isFlagSetInCommandLine("use_device_mem")) {
if (device_name.find("GPU") == 0)
::gpu::fillRemoteBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests, exeNetwork);
else if (device_name.find("CPU") == 0)
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
else
IE_THROW() << "Requested device doesn't support `use_device_mem` option.";
} else {
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
}
// ----------------- 10. Measuring performance
// ------------------------------------------------------------------


@@ -0,0 +1,140 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "remote_blobs_filling.hpp"
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace gpu {
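// Picks std::uniform_real_distribution for floating-point element types,
// std::uniform_int_distribution for integral ones (and void otherwise).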
template <typename T>
using uniformDistribution = typename std::conditional<
std::is_floating_point<T>::value,
std::uniform_real_distribution<T>,
typename std::conditional<std::is_integral<T>::value, std::uniform_int_distribution<T>, void>::type>::type;
template <typename T, typename T2>
void fillBufferRandom(void* inputBuffer,
size_t elementsNum,
T rand_min = std::numeric_limits<uint8_t>::min(),
T rand_max = std::numeric_limits<uint8_t>::max()) {
std::mt19937 gen(0);
uniformDistribution<T2> distribution(rand_min, rand_max);
auto inputBufferData = static_cast<T*>(inputBuffer);
for (size_t i = 0; i < elementsNum; i++) {
inputBufferData[i] = static_cast<T>(distribution(gen));
}
}
void fillBuffer(void* inputBuffer, size_t elementsNum, InferenceEngine::Precision precision) {
if (precision == InferenceEngine::Precision::FP32) {
fillBufferRandom<float, float>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::FP16) {
fillBufferRandom<short, short>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I32) {
fillBufferRandom<int32_t, int32_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I64) {
fillBufferRandom<int64_t, int64_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::U8) {
// uniform_int_distribution<uint8_t> is not allowed in the C++17
// standard and vs2017/19
fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I8) {
// uniform_int_distribution<int8_t> is not allowed in the C++17 standard
// and vs2017/19
fillBufferRandom<int8_t, int32_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::U16) {
fillBufferRandom<uint16_t, uint16_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::I16) {
fillBufferRandom<int16_t, int16_t>(inputBuffer, elementsNum);
} else if (precision == InferenceEngine::Precision::BOOL) {
fillBufferRandom<uint8_t, uint32_t>(inputBuffer, elementsNum, 0, 1);
} else {
IE_THROW() << "Requested precision is not supported";
}
}
size_t getBytesPerElement(InferenceEngine::Precision precision) {
switch (precision) {
case InferenceEngine::Precision::FP32:
return 4;
case InferenceEngine::Precision::FP16:
return 2;
case InferenceEngine::Precision::I32:
return 4;
case InferenceEngine::Precision::I64:
return 8;
case InferenceEngine::Precision::U8:
return 1;
case InferenceEngine::Precision::I8:
return 1;
case InferenceEngine::Precision::U16:
return 2;
case InferenceEngine::Precision::I16:
return 2;
case InferenceEngine::Precision::BOOL:
return 1;
default:
IE_THROW() << "Requested precision is not supported";
}
}
void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests,
const InferenceEngine::ExecutableNetwork& exeNetwork) {
#ifdef HAVE_DEVICE_MEM_SUPPORT
slog::info << "Device memory will be used for input and output blobs" << slog::endl;
if (inputFiles.size()) {
slog::warn << "Device memory supports only random data at this moment, input images will be ignored"
<< slog::endl;
}
auto context = exeNetwork.GetContext();
auto oclContext = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context)->get();
auto oclInstance = std::make_shared<OpenCL>(oclContext);
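// For every blob: allocate a device-side cl::Buffer of the matching size, optionally fill it with
// random data through a temporarily mapped host pointer, wrap it into a remote blob and set it on the request.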
auto setShared = [&](size_t requestId,
const std::string name,
const InferenceEngine::TensorDesc& desc,
bool fillRandom = false) {
cl_int err;
auto inputDims = desc.getDims();
auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies<size_t>());
auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision());
cl::Buffer sharedBuffer =
cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err);
if (fillRandom) {
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(sharedBuffer,
CL_TRUE,
CL_MAP_WRITE,  // map flags (the buffer is filled below), not the CL_MEM_* allocation flags
0,
(cl::size_type)inputSize);
fillBuffer(mappedPtr, elementsNum, desc.getPrecision());
oclInstance->_queue.enqueueUnmapMemObject(sharedBuffer, mappedPtr);
}
InferenceEngine::Blob::Ptr sharedBlob = InferenceEngine::gpu::make_shared_blob(desc, context, sharedBuffer);
requests.at(requestId)->setBlob(name, sharedBlob);
};
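// Allocate device-side blobs for every input and output of each request;
// inputs are filled with random data, outputs stay uninitialized device buffers.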
for (size_t requestId = 0; requestId < requests.size(); requestId++) {
for (auto& item : exeNetwork.GetInputsInfo())
setShared(requestId, item.first, item.second->getTensorDesc(), true);
for (auto& item : exeNetwork.GetOutputsInfo())
setShared(requestId, item.first, item.second->getTensorDesc());
}
#else
IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked";
#endif
}
} // namespace gpu
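Output blobs created this way also live in device memory. For reference, a minimal sketch (not part of the patch; oclInstance, sharedBuffer and bufferSize stand for the objects created in setShared above) of reading such a buffer back to the host with the same cl2.hpp calls used for filling inputs:

// Illustration only: map the device buffer for reading, inspect it, then unmap.
cl_int mapErr = CL_SUCCESS;
void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(sharedBuffer,
                                                       CL_TRUE,  // blocking map
                                                       CL_MAP_READ,
                                                       0,
                                                       (cl::size_type)bufferSize,
                                                       nullptr,
                                                       nullptr,
                                                       &mapErr);
// ... read results through mappedPtr ...
oclInstance->_queue.enqueueUnmapMemObject(sharedBuffer, mappedPtr);
oclInstance->_queue.finish();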


@@ -0,0 +1,64 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#if defined(HAVE_GPU_DEVICE_MEM_SUPPORT)
# define HAVE_DEVICE_MEM_SUPPORT
# include <gpu/gpu_context_api_ocl.hpp>
#endif
#include <inference_engine.hpp>
#include "infer_request_wrap.hpp"
#include "utils.hpp"
namespace gpu {
#ifdef HAVE_DEVICE_MEM_SUPPORT
struct OpenCL {
cl::Context _context;
cl::Device _device;
cl::CommandQueue _queue;
explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
// get Intel GPU OCL device, create context and queue
{
std::vector<cl::Device> devices;
std::vector<cl::Platform> platforms;
const unsigned int refVendorID = 0x8086;
cl::Platform::get(&platforms);
for (auto& p : platforms) {
p.getDevices(CL_DEVICE_TYPE_GPU, &devices);
for (auto& d : devices) {
if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
_device = d;
_context = cl::Context(_device);
break;
}
}
}
cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
_queue = cl::CommandQueue(_context, _device, props);
}
}
explicit OpenCL(cl_context context) {
// user-supplied context handle
_context = cl::Context(context, true);
_device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);
cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
_queue = cl::CommandQueue(_context, _device, props);
}
};
#endif
void fillRemoteBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests,
const InferenceEngine::ExecutableNetwork& exeNetwork);
} // namespace gpu


@@ -2,6 +2,40 @@
# SPDX-License-Identifier: Apache-2.0
#
if(NOT ENABLE_CLDNN)
return()
endif()
function(get_lib_name TARGET_NAME LIBRARY_NAME)
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(BUILD_SUFFIX ${IE_DEBUG_POSTFIX})
else()
set(BUILD_SUFFIX ${IE_RELEASE_POSTFIX})
endif()
if(WIN32)
set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_LINK_LIBRARY_SUFFIX}")
else()
set(LIB_SUFFIX "${BUILD_SUFFIX}${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
set("${LIBRARY_NAME}" "${CMAKE_SHARED_MODULE_PREFIX}${TARGET_NAME}${LIB_SUFFIX}" PARENT_SCOPE)
endfunction()
function(get_lib_path OUTPUT_DIR FINAL_OUTPUT_DIR)
if(WIN32)
set(LIB_DIR "")
else()
set(LIB_DIR "lib")
endif()
if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT UNIX)
set(OUTPUT_DIR "${OUTPUT_DIR}/${CMAKE_BUILD_TYPE}")
endif()
set("${FINAL_OUTPUT_DIR}" "${OUTPUT_DIR}/${LIB_DIR}" PARENT_SCOPE)
endfunction()
set(OPENCL_ICD_LOADER_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" CACHE PATH "Path to OCL includes" FORCE)
set(OPENCL_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cl_headers/" "${CMAKE_CURRENT_SOURCE_DIR}/clhpp_headers/include/" CACHE PATH "Path to OCL (CL and CLHPP) includes" FORCE)
@@ -20,5 +54,10 @@ target_include_directories(OpenCL SYSTEM PUBLIC ${OPENCL_HEADERS_DIR})
# The following variables are needed to make find_package(OpenCL) work
set(OpenCL_VERSION_STRING "2.2" CACHE STRING "" FORCE)
set(OpenCL_INCLUDE_DIR "${OPENCL_ICD_LOADER_HEADERS_DIR}" CACHE PATH "" FORCE)
set(OpenCL_INCLUDE_DIR "${OPENCL_HEADERS_DIR}" CACHE PATH "" FORCE)
set(OPENCLROOT "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" CACHE PATH "" FORCE)
get_lib_path("${OUTPUT_ROOT}/${BIN_FOLDER}" OPENCL_LIB_DIR)
get_lib_name("OpenCL" OPENCL_LIB_NAME)
set(OpenCL_LIBRARY "${OPENCL_LIB_DIR}/${OPENCL_LIB_NAME}" CACHE PATH "" FORCE)