From 0509c66ce03e43f9e7f660788a9e335a04275b25 Mon Sep 17 00:00:00 2001
From: Konrad Dobros <konrad.dobros@intel.com>
Date: Thu, 2 Jul 2020 13:18:28 +0200
Subject: [PATCH] [IE CLDNN] Add some auto-tuning improvements (#1154)

- add error reporting for failed kernel runs during auto-tune
- fix auto-tuning for asymmetric quantization
- add asymmetric quantization information to cache
- change auto-tuning metric from average to min
---
 .../convolution/convolution_params.cpp        |  2 +-
 .../core/common/weight_bias_params.cpp        | 16 +++++++++
 .../core/common/weight_bias_params.h          |  1 +
 .../thirdparty/clDNN/src/gpu/kernel.cpp       |  2 +-
 .../clDNN/src/gpu/kernel_runner.cpp           | 35 ++++++++++++-------
 5 files changed, 42 insertions(+), 14 deletions(-)
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp
index 44bd574cc91..68b2ca86197 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_params.cpp
@@ -41,7 +41,7 @@ std::string convolution_params::to_string() const {
 std::string convolution_params::to_cache_string_v2() const {
     std::stringstream s;
 
-    s << weight_bias_params::to_cache_string_v2() << ";";
+    s << parent::to_cache_string_v2() << ";";
     s << filterSize.x << "_" << filterSize.y << "_" << filterSize.z << ";";
     s << stride.x << "_" << stride.y << "_" << stride.z << ";";
     s << dilation.x << "_" << dilation.y << "_" << dilation.z << ";";
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.cpp
index 4278cc33186..e6a94c06ee8 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.cpp
@@ -15,6 +15,7 @@
 */
 
 #include "weight_bias_params.h"
+#include <sstream>
 
 namespace kernel_selector {
 ParamsKey weight_bias_params::GetParamsKey() const {
@@ -37,4 +38,19 @@ ParamsKey weight_bias_params::GetParamsKey() const {
 
     return k;
 }
+
+// Builds the v2 cache key: the base key plus a marker for each asymmetric-quantization input in use.
+std::string weight_bias_zero_point_params::to_cache_string_v2() const {
+    const bool has_activation_zp = !activations_zero_points.empty();
+    const bool has_weights_zp = !weights_zero_points.empty();
+
+    std::stringstream key;
+    key << weight_bias_params::to_cache_string_v2();
+    // Markers are appended in a fixed order: activations, then weights, then compensation.
+    key << (has_activation_zp ? ";activation_zp" : "");
+    key << (has_weights_zp ? ";weights_zp" : "");
+    key << (HasCompensation() ? ";compensation" : "");
+    return key.str();
+}
+
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.h
index ac4c5f5160d..7d1db484771 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/weight_bias_params.h
@@ -43,6 +43,7 @@ struct weight_bias_zero_point_params : public weight_bias_params {
     MultiDataTensor compensation;
 
     bool HasCompensation() const { return !compensation.empty(); }
+    std::string to_cache_string_v2() const override;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp b/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp
index cf666b1fc59..aeeb91262eb 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/kernel.cpp
@@ -238,7 +238,7 @@ void set_arguments(kernels_cache::kernel_type& kernel,
         }
 
         if (status != CL_SUCCESS) {
-            throw std::runtime_error("Error set args\n");
+            throw std::runtime_error("Error set arg " + std::to_string(i) + ", error code: " + std::to_string(status) + "\n");
         }
     }
 }
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.cpp b/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.cpp
index ecf71af2c4c..37225d9fed7 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/kernel_runner.cpp
@@ -135,8 +135,9 @@ void kernel_runner::prepare_kernel_args(const kernel_selector::KernelsData& kern
         if (zero_points_exist) {
             const auto& zero_point_params =
                 static_cast<const kernel_selector::weight_bias_zero_point_params&>(weights_bias_params);
-            if (weight_zero_point_buffers.empty()) {
-                for (auto& weight_zero_point : zero_point_params.weights_zero_points) {
+            if (!zero_point_params.weights_zero_points.empty()) {
+                if (weight_zero_point_buffers.empty()) {
+                    auto& weight_zero_point = zero_point_params.weights_zero_points[0];
                     auto num_of_elements = static_cast<int>(weight_zero_point.PhysicalSize());
                     weight_zero_point_buffers.push_back(
                         engine->allocate_memory({
@@ -145,28 +146,33 @@ void kernel_runner::prepare_kernel_args(const kernel_selector::KernelsData& kern
                             tensor(1, num_of_elements, 1, 1) },
                             0));
                 }
+                args.weights_zero_points = weight_zero_point_buffers[0];
             }
-            if (activation_zero_point_buffers.empty()) {
-                for (auto& activation_zero_point : zero_point_params.activations_zero_points) {
+            if (!zero_point_params.activations_zero_points.empty()) {
+                if (activation_zero_point_buffers.empty()) {
+                    auto& activation_zero_point = zero_point_params.activations_zero_points[0];
                     auto num_of_elements = static_cast<int>(activation_zero_point.PhysicalSize());
-                    weight_zero_point_buffers.push_back(
+                    activation_zero_point_buffers.push_back(
                         engine->allocate_memory({
                             from_data_type(activation_zero_point.GetDType()),
                             format::bfyx,
                             tensor(1, num_of_elements, 1, 1) },
                             0));
                 }
+                args.activations_zero_points = activation_zero_point_buffers[0];
             }
-            if (compensation_buffers.empty()) {
-                for (auto& compensation : zero_point_params.compensation) {
+            if (!zero_point_params.compensation.empty()) {
+                if (compensation_buffers.empty()) {
+                    auto& compensation = zero_point_params.compensation[0];
                     auto num_of_elements = static_cast<int>(compensation.PhysicalSize());
-                    weight_zero_point_buffers.push_back(
+                    compensation_buffers.push_back(
                         engine->allocate_memory({
                             from_data_type(compensation.GetDType()),
                             format::bfyx,
                             tensor(1, num_of_elements, 1, 1) },
                             0));
                 }
+                args.compensation = compensation_buffers[0];
             }
         }
     }
@@ -202,19 +208,24 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
         int i = 0;
         for (auto it = batch_start; it < batch_end; it++) {
             std::vector<event_impl::ptr> events;
-            auto kernel_run_time = std::chrono::nanoseconds::zero();
+            auto kernel_run_time = std::chrono::nanoseconds::max();
             int num_of_runs = 0;
 
             for (int iteration = 0; iteration < runs_per_kernel; iteration++) {
                 event_impl::ptr event;
                 try {
                     event = kernels[i].run(0, it->kernels[0], {}, args);
+                } catch (const std::exception& e) {  // std::exception carries a message; report it too
+                    std::cout << "[clDNN] Could not run kernel for auto-tune: " << it->kernelName
+                              << " with auto-tune index " << it->autoTuneIndex
+                              << ", error message:" << e.what() << std::endl;
                 } catch (...) {
                     // Could not run this kernel. Push back NULL event (will be ignored later).
+                    std::cout << "[clDNN] Could not run kernel for auto-tune: " << it->kernelName
+                              << " with auto-tune index " << it->autoTuneIndex << std::endl;
                 }
                 events.push_back(event);
             }
-
             context->queue(0).finish();
 
             for (auto& event : events) {
@@ -222,7 +233,7 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
                     auto profiling_intervals = event->get_profiling_info();
                     for (auto const& profiling_interval : profiling_intervals) {
                         if (profiling_interval.name == "executing") {
-                            kernel_run_time += profiling_interval.value->value();
+                            kernel_run_time = std::min(profiling_interval.value->value(), kernel_run_time);
                             num_of_runs++;
                             break;
                         }
@@ -231,7 +242,7 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
             }
 
             if (num_of_runs > 0) {
-                run_times.push_back(kernel_run_time / num_of_runs);
+                run_times.push_back(kernel_run_time);
                 num_of_kernels_run += 1;
             } else {
                 run_times.push_back(std::chrono::nanoseconds::max());