Files
openvino/inference-engine/thirdparty/clDNN/runtime/kernels_cache.cpp
Mingyu Kim f15fcabfeb [GPU] minor improvements (#8113)
* Show bucket/batch id in case of build failure
* Support OV_GPU_DisableOnednn properly.
2021-10-22 13:01:14 +09:00

490 lines
18 KiB
C++

// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "kernels_factory.hpp"
#include "kernels_cache.hpp"
#include "ocl/ocl_engine.hpp"
#include "cldnn/runtime/debug_configuration.hpp"
#include <algorithm>
#include <cassert>
#include <sstream>
#include <fstream>
#include <set>
#include <string>
#include <memory>
#include <utility>
#include "cldnn_itt.hpp"
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
#include <thread>
#include <future>
#include <queue>
#include <condition_variable>
#endif
#if defined(__unix__) && !defined(__ANDROID__)
#include <malloc.h>
#endif
#ifndef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
# ifdef _WIN32
# if defined __INTEL_COMPILER || defined _MSC_VER
# define OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
# endif
# elif defined(__GNUC__) && (__GNUC__ > 5 || (__GNUC__ == 5 && __GNUC_MINOR__ > 2)) || defined(__clang__)
# define OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
# endif
#endif
#ifndef _WIN32
#ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
#include <locale>
#include <codecvt>
#endif
#else
#include <Windows.h>
#endif
#if (CLDNN_THREADING != CLDNN_THREADING_SEQ)
#define DEFAULT_NUM_THREADS 2
#endif
namespace {
std::mutex cacheAccessMutex;
#if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
// Converts a UTF-8 (multi-byte) string to a wide string so cache files with
// non-ASCII paths can be opened via the wide-char file APIs (_wfopen below).
// This function is only compiled when OPENVINO_ENABLE_UNICODE_PATH_SUPPORT and
// _WIN32 are both defined (see the surrounding #if), so the #else branch below
// is effectively unreachable; it is kept for symmetry with other components.
std::wstring multiByteCharToWString(const char* str) {
#ifdef _WIN32
// Two-step WinAPI conversion: first call computes the required wide length,
// second call performs the actual UTF-8 -> UTF-16 conversion.
int strSize = static_cast<int>(std::strlen(str));
int size_needed = MultiByteToWideChar(CP_UTF8, 0, str, strSize, NULL, 0);
std::wstring wstrTo(size_needed, 0);
MultiByteToWideChar(CP_UTF8, 0, str, strSize, &wstrTo[0], size_needed);
return wstrTo;
#else
// NOTE(review): std::wstring_convert/codecvt_utf8 are deprecated since C++17;
// dead code here given the outer _WIN32 guard — confirm before reusing.
std::wstring_convert<std::codecvt_utf8<wchar_t>> wstring_encoder;
std::wstring result = wstring_encoder.from_bytes(str);
return result;
#endif // _WIN32
}
#endif // defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
// Reads the whole file at `path` into a byte vector under the cache-access lock.
// Returns an empty vector on any failure (missing file, seek/tell error, short
// read) so callers treat every failure uniformly as a cache miss.
static std::vector<unsigned char> loadBinaryFromFile(std::string path) {
    std::lock_guard<std::mutex> lock(cacheAccessMutex);

#if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
    // Convert to a wide path so non-ASCII cache directories work on Windows.
    std::wstring widefilename = multiByteCharToWString(path.c_str());
    const wchar_t* filename = widefilename.c_str();
    FILE *fp = _wfopen(filename, L"rb");
#else
    const char* filename = path.c_str();
    FILE *fp = fopen(filename, "rb");
#endif

    if (fp) {
        fseek(fp, 0, SEEK_END);
        long file_size = ftell(fp);
        // BUGFIX: ftell returns -1 on failure; the old cast to size_t turned
        // that into a huge allocation request.
        if (file_size < 0) {
            fclose(fp);
            return {};
        }
        fseek(fp, 0, SEEK_SET);

        std::vector<unsigned char> ret(static_cast<size_t>(file_size));
        size_t read_count = fread(ret.data(), sizeof(unsigned char), ret.size(), fp);
        fclose(fp);
        // BUGFIX: a short read previously returned a partially-filled buffer as
        // if it were a valid precompiled binary; treat it as a cache miss.
        if (read_count != ret.size())
            return {};
        return ret;
    }
    return {};
}
// Writes `buffer` to the file at `path` under the cache-access lock.
// Failures are silent by design: kernel caching is best-effort and must not
// break compilation.
// BUGFIX: `buffer` is now taken by const reference (the old by-value parameter
// copied the whole binary), and `&buffer[0]` — undefined behavior for an empty
// vector — is replaced with `buffer.data()` guarded by an emptiness check.
static void saveBinaryToFile(std::string path, const std::vector<unsigned char>& buffer) {
    std::lock_guard<std::mutex> lock(cacheAccessMutex);
#if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
    // Convert to a wide path so non-ASCII cache directories work on Windows.
    std::wstring widefilename = multiByteCharToWString(path.c_str());
    const wchar_t* filename = widefilename.c_str();
#else
    const char* filename = path.c_str();
#endif
    std::ofstream out_file(filename, std::ios::out | std::ios::binary);
    if (out_file.is_open() && !buffer.empty()) {
        out_file.write(reinterpret_cast<const char*>(buffer.data()), buffer.size());
    }
}
// Normalizes a compiler-options string by sorting the individual options and
// deduplicating them, so that option strings that differ only in ordering map
// to the same program bucket. The result is space-separated with a trailing
// space (matching historical output consumed as a map key).
std::string reorder_options(const std::string& org_options) {
    std::stringstream ss(org_options);
    std::set<std::string> sorted_options;

    // BUGFIX: the previous `while (ss.good())` loop inserted an empty token
    // when the input had trailing whitespace, which produced a spurious
    // leading space in the result; `ss >> word` as the condition only
    // iterates on successful extractions.
    std::string word;
    while (ss >> word) {
        sorted_options.insert(word);
    }

    std::string options;
    for (const auto& o : sorted_options) {
        options += o + " ";
    }
    return options;
}
// Kernels may share one program (batch compilation) only when their options
// carry no per-kernel defines (-D) and no include paths (-I), since those
// would leak between unrelated kernels compiled in the same program.
inline bool does_options_support_batch_compilation(const std::string& options) {
    const bool has_define = options.find("-D") != std::string::npos;
    const bool has_include_path = options.find("-I") != std::string::npos;
    return !has_define && !has_include_path;
}
} // namespace
namespace cldnn {
std::mutex kernels_cache::_mutex;
// Returns the configured kernels-cache directory with a guaranteed trailing
// path separator, or an empty string when caching is disabled.
std::string kernels_cache::get_cache_path() const {
    std::string path = _engine.configuration().kernels_cache_path;
    if (path.empty())
        return {};
    const char last = path.back();
    if (last != '/' && last != '\\')
        path.push_back('/');
    return path;
}
// Kernel caching is considered enabled whenever a cache path was configured.
bool kernels_cache::is_cache_enabled() const {
    const auto& cache_path = _engine.configuration().kernels_cache_path;
    return !cache_path.empty();
}
// Upper bound on the number of kernels compiled into one program batch.
// Smaller batches reduce kernel reuse across models via the on-disk cache;
// forcing 1 would significantly increase compile time (see build_batch).
size_t kernels_cache::get_max_kernels_per_batch() const {
    constexpr size_t max_kernels_per_batch = 10;
    return max_kernels_per_batch;
}
// Groups pending kernel sources into program "buckets" keyed by (normalized)
// compile options, splits each bucket into batches of at most
// get_max_kernels_per_batch() kernels, computes a content hash per batch (used
// as the on-disk cache key), and appends all batches to *all_batches.
void kernels_cache::get_program_source(const kernels_code& kernels_source_code, std::vector<kernels_cache::batch_program>* all_batches) const {
    OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll::GetProgramSource");
    std::map<std::string, std::vector<batch_program>> program_buckets;

    for (const auto& code : kernels_source_code) {
        std::string full_code = code.kernel_strings->jit + code.kernel_strings->str + code.kernel_strings->undefs;
        const source_code org_source_code = { full_code };
        std::string entry_point = code.kernel_strings->entry_point;
        std::string options = code.kernel_strings->options;
        bool batch_compilation = code.kernel_strings->batch_compilation;
        bool dump_custom_program = code.dump_custom_program;

        batch_compilation &= does_options_support_batch_compilation(options);

        if (batch_compilation) {
            // Sort options so that permutations of the same set share a bucket.
            options = reorder_options(options);
        }

        std::string key = options;

        if (batch_compilation == false) {
            // Non-batchable kernels each get their own unique bucket.
            key += " __PROGRAM__" + std::to_string(program_buckets.size());
        }

        if (dump_custom_program) {
            key += " __DUMP_CUSTOM_PROGRAM__";  // Adding label to key so it would be separated from other programs
        }

        auto& current_bucket = program_buckets[key];
        if (current_bucket.empty()) {  // new bucket
            const int32_t batch_id = 0;
            const int32_t bucket_id = static_cast<int32_t>(program_buckets.size() - 1);
            current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
        }

        // Create new kernels batch when the limit is reached.
        if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()) {
            // BUGFIX: reuse the bucket id already assigned to this bucket's
            // batches. The old code used program_buckets.size(), which is off
            // by one vs. the `size() - 1` id assigned at bucket creation, so
            // overflow batches reported a bucket id belonging to no bucket
            // (visible in dump file names and build-failure messages).
            const auto bucket_id = current_bucket.back().bucket_id;
            const auto batch_id = static_cast<int32_t>(current_bucket.size());
            current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
        }

        auto& current_batch = current_bucket.back();
        current_batch.dump_custom_program = dump_custom_program;
        current_batch.entry_point_to_id[entry_point] = code.id;

        assert(org_source_code.size() == 1);
        // Note: org_source_code is const, so the previous std::move was a
        // silent copy anyway; make the copy explicit.
        current_batch.source.push_back(org_source_code.front());
        current_batch.kernels_counter++;
    }

    // Compute hash value for each batch
    // Hash calculation might require additional optimizations, but currently execution time of this part is much smaller than loading
    // of the precompiled binaries or get_undef_jit calls
    // Hash is computed for string that contains compilation options + driver version +
    // full source code (jit + template + undef sections) of all kernels in the batches
    for (auto& c : program_buckets) {
        auto options = c.first;
        auto& batches = c.second;
        for (auto& b : batches) {
            std::string full_code = options + " " + _engine.get_device_info().driver_version;
            for (auto& ss : b.source)
                full_code += ss;
            b.hash_value = std::hash<std::string>()(full_code);
            all_batches->push_back(b);
        }
    }
}
kernels_cache::kernels_cache(engine& engine) : _engine(engine) { }
// Registers kernel source for later compilation by build_all() and returns a
// unique kernel id for it. Thread-safe.
kernel_id kernels_cache::set_kernel_source(
    const std::shared_ptr<kernel_string>& kernel_string,
    bool dump_custom_program) {
    std::lock_guard<std::mutex> lock(_mutex);
    // Suffix the entry point with a running count so identical entry points
    // registered from different topologies still get distinct ids.
    const size_t unique_idx = _kernels.size() + _kernels_code.size();
    kernel_id id = kernel_string->entry_point + "_" + std::to_string(unique_idx);
    assert(_kernels.find(id) == _kernels.end());
    const bool newly_inserted = _kernels_code.emplace(kernel_string, id, dump_custom_program).second;
    if (newly_inserted)
        _pending_compilation = true;
    return id;
}
// Extracts the device binary from a built cl::Program so it can be stored in
// the on-disk kernels cache. Programs here are always built for exactly one
// device; anything else (or an empty binary) is reported as an error.
static std::vector<unsigned char> getProgramBinaries(cl::Program program) {
    const std::vector<size_t> binary_sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
    if (binary_sizes.size() != 1)
        throw std::runtime_error("Invalid binaries count");
    if (binary_sizes.front() == 0)
        throw std::runtime_error("Binary is not avaliable after program build");
    return program.getInfo<CL_PROGRAM_BINARIES>().front();
}
// TODO: This build_batch method should be backend specific
void kernels_cache::build_batch(const engine& build_engine, const batch_program& batch) {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::build_batch");
auto& cl_build_engine = dynamic_cast<const ocl::ocl_engine&>(build_engine);
bool dump_sources = !_engine.configuration().sources_dumps_dir.empty() || batch.dump_custom_program;
std::string dump_sources_dir = _engine.configuration().sources_dumps_dir;
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_sources.empty()) {
dump_sources = true;
dump_sources_dir = debug_config->dump_sources;
}
std::string err_log; // accumulated build log from all program's parts (only contains messages from parts which
std::string current_dump_file_name = "";
if (dump_sources) {
current_dump_file_name = dump_sources_dir;
if (!current_dump_file_name.empty() && current_dump_file_name.back() != '/')
current_dump_file_name += '/';
current_dump_file_name += "clDNN_program_" + std::to_string(batch.bucket_id) + "_part_" + std::to_string(batch.batch_id) + ".cl";
}
std::ofstream dump_file;
if (dump_sources) {
dump_file.open(current_dump_file_name);
if (dump_file.good()) {
for (auto& s : batch.source)
dump_file << s;
}
}
std::string cached_bin_name = get_cache_path() + std::to_string(batch.hash_value) + ".cl_cache";
cl::Program::Binaries precompiled_kernels = {};
if (is_cache_enabled()) {
// Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket
// If read is successful, then remove kernels from compilation bucket
auto bin = loadBinaryFromFile(cached_bin_name);
if (!bin.empty()) {
precompiled_kernels.push_back(bin);
}
}
try {
cl::vector<cl::Kernel> kernels;
// Run compilation
if (precompiled_kernels.empty()) {
cl::Program program(cl_build_engine.get_cl_context(), batch.source);
{
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildProgram::RunCompilation");
program.build(cl_build_engine.get_cl_device(), batch.options.c_str());
}
if (dump_sources && dump_file.good()) {
dump_file << "\n/* Build Log:\n";
for (auto& p : program.getBuildInfo<CL_PROGRAM_BUILD_LOG>())
dump_file << p.second << "\n";
dump_file << "*/\n";
}
program.createKernels(&kernels);
if (is_cache_enabled()) {
// If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache
// Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited
// Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer
// compile time.
saveBinaryToFile(cached_bin_name, getProgramBinaries(program));
}
} else {
cl::Program program(cl_build_engine.get_cl_context(), {cl_build_engine.get_cl_device()}, precompiled_kernels);
program.build(cl_build_engine.get_cl_device(), batch.options.c_str());
program.createKernels(&kernels);
}
{
std::lock_guard<std::mutex> lock(_mutex);
for (auto& k : kernels) {
const auto& entry_point = k.getInfo<CL_KERNEL_FUNCTION_NAME>();
const auto& k_id = batch.entry_point_to_id.find(entry_point);
if (k_id != batch.entry_point_to_id.end()) {
cl_kernel kern = k.get();
cl_context context = cl_build_engine.get_cl_context().get();
kernel::ptr kernel = kernels_factory::create(_engine, context, kern, entry_point);
const auto& kmap = std::make_pair(k_id->second, kernel);
_kernels.insert(kmap);
} else {
throw std::runtime_error("Could not find entry point");
}
}
}
} catch (const cl::BuildError& err) {
if (dump_sources && dump_file.good())
dump_file << "\n/* Build Log:\n";
for (auto& p : err.getBuildLog()) {
if (dump_sources && dump_file.good())
dump_file << p.second << "\n";
err_log += p.second + '\n';
}
if (dump_sources && dump_file.good())
dump_file << "*/\n";
}
if (!err_log.empty()) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->verbose) {
std::cout << "-------- OpenCL build error" << std::endl;
std::cout << err_log << std::endl;
std::cout << "-------- End of OpenCL build error" << std::endl;
}
std::stringstream err_ss(err_log);
std::string line;
int cnt = 0;
while (std::getline(err_ss, line, '\n')) {
if (line.find("error") != std::string::npos)
cnt = 5;
cnt--;
if (cnt > 0)
std::cout << line << std::endl;
else if (cnt == 0)
std::cout << "...." << std::endl;
}
throw std::runtime_error("Program build failed(" + std::to_string(batch.bucket_id) + + "_part_"
+ std::to_string(batch.batch_id)
+ "): You may enable OCL source dump to see the error log.\n");
}
}
// Looks up a compiled kernel by id. Callers must have run build_all() first;
// throws std::runtime_error if compilation is pending or the id is unknown.
kernel::ptr kernels_cache::get_kernel(kernel_id id) const {
    if (_pending_compilation)
        throw std::runtime_error("Kernel cache is not compiled, call build_all() first!");
    const auto it = _kernels.find(id);
    if (it == _kernels.end())
        throw std::runtime_error("Kernel " + id + " not found in the kernel cache!");
    return it->second;
}
// Compiles all pending kernel sources (registered via set_kernel_source) into
// executable kernels. Depending on CLDNN_THREADING, batches are compiled in
// parallel via TBB, a custom thread pool, or sequentially. No-op when nothing
// is pending.
void kernels_cache::build_all() {
OV_ITT_SCOPED_TASK(itt::domains::CLDNN, "KernelsCache::BuildAll");
if (!_pending_compilation)
return;
// A dedicated engine is created for building so compilation does not share
// queues/state with the execution engine.
// NOTE(review): _build_engine stays nullptr for non-OCL engine types yet is
// dereferenced unconditionally below — presumably only OCL engines reach
// this path; confirm before extending to other runtimes.
std::unique_ptr<ocl::ocl_engine> _build_engine = nullptr;
if (_engine.type() == engine_types::ocl) {
_build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl, _engine.configuration()));
}
std::vector<batch_program> batches;
{
// Snapshot pending sources into batches under the lock; the actual
// compilation below runs without holding _mutex.
std::lock_guard<std::mutex> lock(_mutex);
get_program_source(_kernels_code, &batches);
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
int n_threads = _engine.configuration().n_threads;
arena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena());
arena->initialize(n_threads);
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
int n_threads = _engine.configuration().n_threads;
pool = std::unique_ptr<thread_pool>(new thread_pool(n_threads));
#endif
}
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
// TBB path: compile batches in parallel inside the arena.
arena->execute([this, &_build_engine, &batches] {
tbb::parallel_for(tbb::blocked_range<size_t>(0, batches.size()), [this, &_build_engine, &batches](const tbb::blocked_range<size_t>& r) {
for (auto i = r.begin(); i != r.end(); ++i) {
build_batch(*_build_engine, batches[i]);
}
});
});
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
// Thread-pool path: one task per batch, then wait for all of them.
std::vector<std::future<void>> builds;
for (size_t i = 0; i < batches.size(); ++i) {
builds.push_back(pool->enqueue([this, &_build_engine, &batches, i] () {
build_batch(*_build_engine, batches[i]);
}));
}
std::for_each(builds.begin(), builds.end(), [] (std::future<void>& f) { f.wait(); });
#else
// no parallel build
for (const auto& batch : batches) {
build_batch(*_build_engine, batch);
}
#endif
{
// All batches are built: clear the pending sources and tear down the
// parallel-build infrastructure.
std::lock_guard<std::mutex> lock(_mutex);
_kernels_code.clear();
_pending_compilation = false;
#if (CLDNN_THREADING == CLDNN_THREADING_TBB)
arena.reset();
#if defined(__unix__) && !defined(__ANDROID__)
// NOTE: In linux, without malloc_trim, an amount of the memory used by compilation is not being returned to system thought they are freed.
// (It is at least 500 MB when we perform parallel compilation)
// It is observed that freeing the memory manually with malloc_trim saves significant amount of the memory.
// Also, this is not happening in Windows.
// So, added malloc_trim for linux build until we figure out a better solution.
malloc_trim(0);
#endif
#elif(CLDNN_THREADING == CLDNN_THREADING_THREADPOOL)
pool.reset();
#if defined(__unix__) && !defined(__ANDROID__)
// Same malloc_trim rationale as the TBB branch above.
malloc_trim(0);
#endif
#endif
}
}
// Returns the cache to its initial state: drops pending sources, compiled
// kernels, and the pending-compilation flag.
void kernels_cache::reset() {
    _kernels_code.clear();
    _kernels.clear();
    _pending_compilation = false;
}
} // namespace cldnn