[IE CLDNN] Removed unused DO gpu impl (#2809)

This commit is contained in:
Vladimir Paramuzov 2020-10-28 09:24:22 +03:00 committed by GitHub
parent abb8817cf6
commit c7661078d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 77 additions and 1585 deletions

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -142,52 +142,6 @@ struct detection_output : public primitive_base<detection_output> {
protected:
};
/// @brief Generates a list of detections based on location and confidence predictions by doing non maximum suppression.
/// @details Each row is a 7 dimension vector, which stores: [image_id, label, confidence, xmin, ymin, xmax, ymax].
/// If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1.
struct detection_output_sort
: public primitive_base<detection_output_sort> {
CLDNN_DECLARE_PRIMITIVE(detection_output_sort)
/// @brief Constructs detection output primitive.
/// @param id This primitive id.
/// @param input_bboxes Input bounding boxes primitive id.
/// @param num_images Number of images to be predicted.
/// @param num_classes Number of classes to be predicted.
/// @param keep_top_k Number of total bounding boxes to be kept per image after NMS step.
/// @param share_location If true bounding box are shared among different classes.
/// @param top_k Maximum number of results to be kept in NMS.
/// @param background_label_id Background label id (-1 if there is no background class).
/// @param output_padding Output padding.
detection_output_sort(const primitive_id& id,
const primitive_id& input_bboxes,
const uint32_t num_images,
const uint32_t num_classes,
const uint32_t keep_top_k,
const bool share_location = true,
const int top_k = -1,
const int background_label_id = -1,
const padding& output_padding = padding())
: primitive_base(id, {input_bboxes}, output_padding),
num_images(num_images),
num_classes(num_classes),
keep_top_k(keep_top_k),
share_location(share_location),
top_k(top_k),
background_label_id(background_label_id) {}
/// @brief Number of images to be predicted.
const uint32_t num_images;
/// @brief Number of classes to be predicted.
const uint32_t num_classes;
/// @brief Number of total bounding boxes to be kept per image after NMS step.
const int keep_top_k;
/// @brief If true, bounding boxes are shared among different classes.
const bool share_location;
/// @brief Maximum number of results to be kept in NMS.
const int top_k;
/// @brief Background label id (-1 if there is no background class).
const int background_label_id;
};
/// @}
/// @}
/// @}

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -46,9 +46,6 @@ enum class build_option_type {
/// @brief Enable implicit static input reordering for user inputs (default: false).
allow_static_input_reorder,
/// @brief Enable running detection output layer always on gpu, regardless performance
detection_output_gpu,
/// @brief Enable debug mode (default: false).
/// @details This option enforce all program primitives to be accessible as outputs.
debug,
@ -130,9 +127,6 @@ struct build_option {
/// @brief Enable implicit reordering for static user inputs (default: false).
static std::shared_ptr<const build_option> allow_static_input_reorder(bool enable = false);
/// @brief Enable running detection output layer always on GPU, regardless performance (default: false).
static std::shared_ptr<const build_option> detection_output_gpu(bool enable = false);
/// @brief Enable debug mode (default: false).
/// @details This option enforce all program primitives to be accessible as outputs.
static std::shared_ptr<const build_option> debug(bool enable = false);
@ -323,11 +317,6 @@ struct build_option_traits<build_option_type::allow_static_input_reorder> {
static std::shared_ptr<const build_option> make_default() { return build_option::allow_static_input_reorder(); }
};
template <>
struct build_option_traits<build_option_type::detection_output_gpu> {
typedef build_option_bool<build_option_type::detection_output_gpu> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::detection_output_gpu(); }
};
template <>
struct build_option_traits<build_option_type::debug> {
typedef build_option_bool<build_option_type::debug> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::debug(); }
@ -384,10 +373,6 @@ inline std::shared_ptr<const build_option> build_option::allow_static_input_reor
return std::make_shared<build_option_bool<build_option_type::allow_static_input_reorder>>(enable);
}
inline std::shared_ptr<const build_option> build_option::detection_output_gpu(bool enable) {
return std::make_shared<build_option_bool<build_option_type::detection_output_gpu>>(enable);
}
inline std::shared_ptr<const build_option> build_option::debug(bool enable) {
return std::make_shared<build_option_bool<build_option_type::debug>>(enable);
}

View File

@ -56,7 +56,6 @@ enum class KernelType {
PYRAMID_ROI_ALIGN,
CONTRACT,
ONE_HOT,
DETECTION_OUTPUT,
GATHER,
SCATTER_UPDATE,
DEPTH_TO_SPACE,

View File

@ -1,64 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_base.h"
namespace kernel_selector {
JitConstants DetectionOutputKernelBase::GetJitConstants(const detection_output_params& params) const {
    // Start from the constants shared by every kernel, then append the
    // detection-output specific defines consumed by the .cl sources.
    JitConstants jit = MakeBaseParamsJitConstants(params);

    const auto& do_params = params.detectOutParams;

    // Problem dimensions and NMS selection limits.
    jit.AddConstants({
        MakeJitConstant("NUM_IMAGES", do_params.num_images),
        MakeJitConstant("NUM_CLASSES", do_params.num_classes),
        MakeJitConstant("KEEP_TOP_K", do_params.keep_top_k),
        MakeJitConstant("TOP_K", do_params.top_k),
        MakeJitConstant("BACKGROUND_LABEL_ID", do_params.background_label_id),
        MakeJitConstant("CODE_TYPE", do_params.code_type),
    });

    // Confidence-input layout.
    jit.AddConstants({
        MakeJitConstant("CONF_SIZE_X", do_params.conf_size_x),
        MakeJitConstant("CONF_SIZE_Y", do_params.conf_size_y),
        MakeJitConstant("CONF_PADDING_X", do_params.conf_padding_x),
        MakeJitConstant("CONF_PADDING_Y", do_params.conf_padding_y),
    });

    // NMS behaviour and prior-box description.
    jit.AddConstants({
        MakeJitConstant("SHARE_LOCATION", do_params.share_location),
        MakeJitConstant("VARIANCE_ENCODED_IN_TARGET", do_params.variance_encoded_in_target),
        MakeJitConstant("NMS_THRESHOLD", do_params.nms_threshold),
        MakeJitConstant("ETA", do_params.eta),
        MakeJitConstant("CONFIDENCE_THRESHOLD", do_params.confidence_threshold),
        MakeJitConstant("IMAGE_WIDTH", do_params.input_width),
        MakeJitConstant("IMAGE_HEIGH", do_params.input_heigh),  // spelling kept: the .cl sources use IMAGE_HEIGH
        MakeJitConstant("ELEMENTS_PER_THREAD", do_params.elements_per_thread),
        MakeJitConstant("PRIOR_COORD_OFFSET", do_params.prior_coordinates_offset),
        MakeJitConstant("PRIOR_INFO_SIZE", do_params.prior_info_size),
        MakeJitConstant("PRIOR_IS_NORMALIZED", do_params.prior_is_normalized),
    });

    return jit;
}
DetectionOutputKernelBase::DispatchData DetectionOutputKernelBase::SetDefault(const detection_output_params& /*params*/) const {
    // Derived kernels compute the real ND-range; the base class only hands
    // back a zeroed dispatch descriptor.
    DispatchData dispatch_data;
    for (size_t i = 0; i < 3; ++i) {
        dispatch_data.gws[i] = 0;
        dispatch_data.lws[i] = 0;
    }
    return dispatch_data;
}
} // namespace kernel_selector

View File

@ -1,78 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "kernel_base_opencl.h"
#include "kernel_selector_params.h"
namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// detection_output_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Parameters for the detection_output kernels (NMS and sort stages).
struct detection_output_params : public base_params {
detection_output_params() : base_params(KernelType::DETECTION_OUTPUT), detectOutParams() {}
// Detection-output specific knobs; translated 1:1 into JIT preprocessor
// constants by DetectionOutputKernelBase::GetJitConstants.
struct DedicatedParams {
uint32_t num_images;               // batch size (images per dispatch)
uint32_t num_classes;              // number of predicted classes
int32_t keep_top_k;                // max detections kept per image after NMS
int32_t top_k;                     // max candidates kept inside NMS (-1 = unlimited)
int32_t background_label_id;       // background class id (-1 if none)
int32_t code_type;                 // bbox encoding scheme -- presumably caffe PriorBox code_type; confirm against caller
int32_t conf_size_x;               // confidence input width
int32_t conf_size_y;               // confidence input height
int32_t conf_padding_x;            // confidence input x padding
int32_t conf_padding_y;            // confidence input y padding
int32_t elements_per_thread;       // work items granularity used by the .cl kernels
int32_t input_width;               // source image width (JIT: IMAGE_WIDTH)
int32_t input_heigh;               // source image height (JIT: IMAGE_HEIGH; spelling kept to match kernels)
int32_t prior_coordinates_offset;  // offset of coords within a prior-box record
int32_t prior_info_size;           // size of one prior-box record
bool prior_is_normalized;          // true if prior coords are in [0,1]
bool share_location;               // true if bbox predictions are shared across classes
bool variance_encoded_in_target;   // true if variance is folded into targets
float nms_threshold;               // IoU threshold for suppression
float eta;                         // adaptive-threshold decay factor
float confidence_threshold;        // min score to consider a detection
};
DedicatedParams detectOutParams;
virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// detection_output_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// No extra optional data beyond the common optional_params; only tags the
// kernel type so the selector can match it.
struct detection_output_optional_params : optional_params {
detection_output_optional_params() : optional_params(KernelType::DETECTION_OUTPUT) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// DetectionOutputKernelBase
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common base for the detection-output GPU kernels: supplies the shared JIT
// constants and a zero-initialized default dispatch descriptor.
class DetectionOutputKernelBase : public KernelBaseOpenCL {
public:
    using KernelBaseOpenCL::KernelBaseOpenCL;  // inherit ctors (fixed stray space before '::')
    virtual ~DetectionOutputKernelBase() = default;
    using DispatchData = CommonDispatchData;

protected:
    // Builds the per-layer preprocessor defines consumed by the .cl sources.
    JitConstants GetJitConstants(const detection_output_params& params) const;
    // Returns a zeroed dispatch descriptor; derived kernels override it.
    virtual DispatchData SetDefault(const detection_output_params& params) const;
};
} // namespace kernel_selector

View File

@ -1,88 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_ref.h"
#include "kernel_selector_utils.h"
#define PRIOR_BOX_SIZE 4 // Each prior-box consists of [xmin, ymin, xmax, ymax].
namespace kernel_selector {
ParamsKey DetectionOutputKernel::GetSupportedKey() const {
    ParamsKey key;
    // fp16/fp32 inputs and outputs, bfyx layout only.
    key.EnableInputDataType(Datatype::F16);
    key.EnableInputDataType(Datatype::F32);
    key.EnableOutputDataType(Datatype::F16);
    key.EnableOutputDataType(Datatype::F32);
    key.EnableInputLayout(DataLayout::bfyx);
    key.EnableOutputLayout(DataLayout::bfyx);
    // Padded/strided tensors and batched execution are supported.
    key.EnableTensorOffset();
    key.EnableTensorPitches();
    key.EnableBatching();
    return key;
}
// Computes the ND-range for the NMS stage: one work item per bounding box,
// work group sized to one image's boxes so the in-group sort can share one
// local table.
CommonDispatchData DetectionOutputKernel::SetDefault(const detection_output_params& params) const {
CommonDispatchData dispatchData = DetectionOutputKernelBase::SetDefault(params);
// Number of all work items is set to total number of bounding boxes -
// one bounding box is processed by one work item
size_t num_classes = (params.detectOutParams.share_location) ? 1 : params.detectOutParams.num_classes;
// Size of input0 (input location), if shared location it is equal to size of one class,
// else it has size of all items for all classes
size_t bboxesNum = params.inputs[0].LogicalSize() / PRIOR_BOX_SIZE / num_classes;
// Work group size is set to number of bounding boxes per image for sorting purpose
// (access to one table with sorted values)
size_t work_group_size = bboxesNum / params.inputs[0].Batch().v;
// Cap the work group at 256 by splitting into roughly equal chunks.
// NOTE(review): the trailing "+ 1" can push the result above an even split;
// looks intentional (round-up), but confirm against the kernel's NUM_OF_ITEMS.
if (work_group_size > 256) {
work_group_size = work_group_size / ((work_group_size / 256) + 1) + 1;
}
bboxesNum = work_group_size * params.inputs[0].Batch().v;
dispatchData.gws[0] = Align(bboxesNum, work_group_size);
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
dispatchData.lws[0] = work_group_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;
return dispatchData;
}
// Builds the kernel data for the NMS stage: default dispatch, JIT constants,
// and the two extra inputs (confidence, prior boxes) on top of locations.
KernelsData DetectionOutputKernel::GetKernelsData(const Params& params, const optional_params& options) const {
    assert(params.GetType() == KernelType::DETECTION_OUTPUT && options.GetType() == KernelType::DETECTION_OUTPUT);

    KernelData kernel_data = KernelData::Default<detection_output_params>(params);
    const detection_output_params& do_params = static_cast<const detection_output_params&>(params);

    DispatchData dispatch_data = SetDefault(do_params);
    auto jit_constants = GetJitConstants(do_params);
    auto entry_point = GetEntryPoint(kernelName, do_params.layerID, options);
    auto jit_source = CreateJit(kernelName, jit_constants, entry_point);

    auto& cl_kernel = kernel_data.kernels[0];
    FillCLKernelData(cl_kernel, dispatch_data, params.engineInfo, kernelName, jit_source, entry_point);
    // Besides the default location input, the kernel also reads the
    // confidence tensor (arg 1) and the prior boxes (arg 2).
    cl_kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
    cl_kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2});

    kernel_data.estimatedTime = FORCE_PRIORITY_8;
    return {kernel_data};
}
} // namespace kernel_selector

View File

@ -1,33 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "detection_output_kernel_base.h"
namespace kernel_selector {
// Reference OpenCL implementation of the DetectionOutput NMS stage
// (entry point "detection_output" in detection_output.cl).
class DetectionOutputKernel : public DetectionOutputKernelBase {
public:
DetectionOutputKernel() : DetectionOutputKernelBase("detection_output") {}
virtual ~DetectionOutputKernel() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
private:
// Sizes the ND-range: one work item per bounding box, grouped per image.
CommonDispatchData SetDefault(const detection_output_params& params) const override;
};
} // namespace kernel_selector

View File

@ -1,34 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_selector.h"
#include "detection_output_kernel_ref.h"
#include "detection_output_kernel_sort.h"
namespace kernel_selector {
// Registers the available NMS-stage implementations.
detection_output_kernel_selector::detection_output_kernel_selector() { Attach<DetectionOutputKernel>(); }
// Picks the best NMS-stage kernel for the given params (naive strategy:
// first attached kernel whose ParamsKey supports the request).
KernelsData detection_output_kernel_selector::GetBestKernels(const Params& params,
const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::DETECTION_OUTPUT);
}
// Registers the available sort-stage implementations.
detection_output_sort_kernel_selector::detection_output_sort_kernel_selector() { Attach<DetectionOutputKernel_sort>(); }
// Picks the best sort-stage kernel; note it matches on the same
// KernelType::DETECTION_OUTPUT tag as the NMS stage.
KernelsData detection_output_sort_kernel_selector::GetBestKernels(const Params& params,
const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::DETECTION_OUTPUT);
}
} // namespace kernel_selector

View File

@ -1,48 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "kernel_selector.h"
namespace kernel_selector {
// Singleton selector for the DetectionOutput NMS stage.
class detection_output_kernel_selector : public kernel_selector_base {
public:
// Meyers-singleton accessor; thread-safe initialization since C++11.
static detection_output_kernel_selector& Instance() {
static detection_output_kernel_selector instance_;
return instance_;
}
detection_output_kernel_selector();
virtual ~detection_output_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
// Singleton selector for the DetectionOutput sort stage.
class detection_output_sort_kernel_selector : public kernel_selector_base {
public:
// Meyers-singleton accessor; thread-safe initialization since C++11.
static detection_output_sort_kernel_selector& Instance() {
static detection_output_sort_kernel_selector instance_;
return instance_;
}
detection_output_sort_kernel_selector();
virtual ~detection_output_sort_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector

View File

@ -1,82 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_sort.h"
#include "kernel_selector_utils.h"
#define DETECTION_OUTPUT_ROW_SIZE 7 // Each detection consists of [image_id, label, confidence, xmin, ymin, xmax, ymax].
namespace kernel_selector {
ParamsKey DetectionOutputKernel_sort::GetSupportedKey() const {
    ParamsKey key;
    // Same support matrix as the NMS stage: fp16/fp32, bfyx only.
    key.EnableInputDataType(Datatype::F16);
    key.EnableInputDataType(Datatype::F32);
    key.EnableOutputDataType(Datatype::F16);
    key.EnableOutputDataType(Datatype::F32);
    key.EnableInputLayout(DataLayout::bfyx);
    key.EnableOutputLayout(DataLayout::bfyx);
    // Padded/strided tensors and batched execution are supported.
    key.EnableTensorOffset();
    key.EnableTensorPitches();
    key.EnableBatching();
    return key;
}
// Computes the ND-range for the sort stage: work items span classes x images,
// one work group per image's class set.
CommonDispatchData DetectionOutputKernel_sort::SetDefault(const detection_output_params& params) const {
CommonDispatchData dispatchData = DetectionOutputKernelBase::SetDefault(params);
unsigned class_num = params.detectOutParams.num_classes;
// With shared locations and background mapped to class 0, the background
// class carries no output and is dropped from the work size.
if (params.detectOutParams.share_location && params.detectOutParams.background_label_id == 0) {
class_num -= 1;
}
const size_t bboxesNum = class_num * params.detectOutParams.num_images;
// Work group size is set to number of bounding boxes per image
size_t work_group_size = class_num;
// Cap the group at 256; the "+ work_group_size % 2" keeps the numerator even
// before dividing. NOTE(review): verify this matches the kernel's
// NUM_OF_ITEMS_SORT derivation.
if (work_group_size > 256) {
work_group_size = (work_group_size + work_group_size % 2) / (work_group_size / 256 + 1);
}
dispatchData.gws[0] = Align(bboxesNum, work_group_size);
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
dispatchData.lws[0] = work_group_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;
return dispatchData;
}
// Builds the kernel data for the sort stage; unlike the NMS stage it takes
// only the single default input (the NMS output) plus the output buffer.
KernelsData DetectionOutputKernel_sort::GetKernelsData(const Params& params, const optional_params& options) const {
    assert(params.GetType() == KernelType::DETECTION_OUTPUT &&
           options.GetType() == KernelType::DETECTION_OUTPUT);

    KernelData kernel_data = KernelData::Default<detection_output_params>(params);
    const detection_output_params& do_params = static_cast<const detection_output_params&>(params);

    DispatchData dispatch_data = SetDefault(do_params);
    auto jit_constants = GetJitConstants(do_params);
    auto entry_point = GetEntryPoint(kernelName, do_params.layerID, options);
    auto jit_source = CreateJit(kernelName, jit_constants, entry_point);

    auto& cl_kernel = kernel_data.kernels[0];
    FillCLKernelData(cl_kernel, dispatch_data, params.engineInfo, kernelName, jit_source, entry_point);

    kernel_data.estimatedTime = FORCE_PRIORITY_8;
    return {kernel_data};
}
} // namespace kernel_selector

View File

@ -1,33 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "detection_output_kernel_base.h"
namespace kernel_selector {
class DetectionOutputKernel_sort : public DetectionOutputKernelBase {
public:
DetectionOutputKernel_sort() : DetectionOutputKernelBase("detection_output_sort") {}
virtual ~DetectionOutputKernel_sort() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
private:
CommonDispatchData SetDefault(const detection_output_params& params) const override;
};
} // namespace kernel_selector

View File

@ -1,217 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#include "include/detection_output_common.cl"
// NMS stage of DetectionOutput. Per (image, class) pair the work group:
//   1. bubble-sorts prior-box indices by confidence in local memory,
//   2. runs greedy NMS with an adaptive IoU threshold,
//   3. writes survivors as [image_id, label, score, xmin, ymin, xmax, ymax]
//      rows, padding the remainder with image_id = -1 rows.
// NOTE(review): NUM_OF_ITEMS, NUM_OF_ITERATIONS, NUM_OF_PRIORS, SCORES_COUNT,
// NUM_CLASSES_OUT, HIDDEN_CLASS, OUTPUT_* come from
// include/detection_output_common.cl (not visible here); confirm there.
KERNEL (detection_output)(__global UNIT_TYPE* input_location, __global UNIT_TYPE* output, __global UNIT_TYPE* input_confidence, __global UNIT_TYPE* input_prior_box)
{
const uint idx = get_global_id(0); // bbox idx
const uint local_id = (uint)get_local_id(0) * NUM_OF_ITEMS; // All bboxes from one image in work group
const uint idx_image = idx / NUM_OF_ITERATIONS; // idx of current image
// Local scratch: sort table of bbox indices and per-class survivor counts.
__local uint indexes[NUM_OF_PRIORS];
__local uint scores_size[NUM_CLASSES * NUM_OF_IMAGES];
__local bool stillSorting;
uint indexes_class_0[NUM_OF_PRIORS];
// Find whether any of this item's NUM_OF_ITEMS bboxes is the last of a class;
// that item later performs the single-threaded per-class NMS.
int last_bbox_in_class = NUM_OF_ITEMS;
bool is_last_bbox_in_class = false;
for (uint it = 0; it < NUM_OF_ITEMS; it ++)
{
if (((local_id + it + 1) % NUM_OF_PRIORS) == 0 )
{
last_bbox_in_class = it;
is_last_bbox_in_class = true;
break;
}
}
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++)
{
if (idx_class == BACKGROUND_LABEL_ID)
{
continue;
}
// Reset the sort table to the identity permutation.
for (uint it = 0; it < NUM_OF_ITEMS; it++)
{
indexes[local_id + it] = local_id + it;
}
stillSorting = true;
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_bbox_in_image = (is_last_bbox_in_class) && (idx_class == (NUM_CLASSES - 1));
// Cooperative odd-even transposition sort by descending confidence.
while(stillSorting)
{
barrier(CLK_LOCAL_MEM_FENCE);
stillSorting = false;
for (uint i = 0; i < 2; i++)
{
for (uint it = 0; it < NUM_OF_ITEMS; it++)
{
uint item_id = local_id + it;
uint idx1 = indexes[item_id];
uint idx2 = indexes[item_id+1];
bool perform = false;
// Alternate odd/even pairs; never compare across class boundary.
if ((((i % 2) && (item_id % 2)) ||
((!(i % 2)) && (!(item_id % 2)))) &&
(it < last_bbox_in_class))
{
perform = true;
}
if (perform &&
(FUNC_CALL(get_score)(input_confidence, idx1, idx_class, idx_image) <
FUNC_CALL(get_score)(input_confidence, idx2, idx_class, idx_image)))
{
indexes[item_id] = idx2;
indexes[item_id+1] = idx1;
stillSorting = true;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
// Do it only once per class in image
if (is_last_bbox_in_class)
{
UNIT_TYPE adaptive_threshold = NMS_THRESHOLD;
uint post_nms_count = 0;
const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
scores_size[idx_class] = 0;
// Do the "keep" algorithm only for classes with confidence greater than CONFIDENCE_THRESHOLD.
// Check first, the biggest one (after sort) element in class.
if (FUNC_CALL(get_score)(input_confidence, indexes[0], idx_class, idx_image) != 0.0f)
{
// Greedy NMS: keep a candidate if its IoU with every already-kept
// box stays at or below the (decaying) adaptive threshold.
for (uint i = 0; i < SCORES_COUNT; i++)
{
const uint bb_idx = indexes[i];
bool keep = true;
for (uint j = 0; j < post_nms_count; j++)
{
if (!keep)
{
break;
}
UNIT_TYPE overlap = 0.0;
const uint bb_idx2 = indexes[j];
UNIT_TYPE decoded_bbox1[4];
FUNC_CALL(get_decoded_bbox)(decoded_bbox1, input_location, input_prior_box, bb_idx, shared_class, idx_image);
UNIT_TYPE decoded_bbox2[4];
FUNC_CALL(get_decoded_bbox)(decoded_bbox2, input_location, input_prior_box, bb_idx2, shared_class, idx_image);
bool intersecting =
(decoded_bbox1[0] < decoded_bbox2[2]) &
(decoded_bbox2[0] < decoded_bbox1[2]) &
(decoded_bbox1[1] < decoded_bbox2[3]) &
(decoded_bbox2[1] < decoded_bbox1[3]);
if (intersecting)
{
// Intersection-over-union of the two decoded boxes.
const UNIT_TYPE intersect_width = min(decoded_bbox1[2], decoded_bbox2[2]) - max(decoded_bbox1[0], decoded_bbox2[0]);
const UNIT_TYPE intersect_height = min(decoded_bbox1[3], decoded_bbox2[3]) - max(decoded_bbox1[1], decoded_bbox2[1]);
const UNIT_TYPE intersect_size = intersect_width * intersect_height;
const UNIT_TYPE bbox1_area = (decoded_bbox1[2] - decoded_bbox1[0]) * (decoded_bbox1[3] - decoded_bbox1[1]);
const UNIT_TYPE bbox2_area = (decoded_bbox2[2] - decoded_bbox2[0]) * (decoded_bbox2[3] - decoded_bbox2[1]);
overlap = intersect_size / (bbox1_area + bbox2_area - intersect_size);
}
keep = (overlap <= adaptive_threshold);
}
if (keep)
{
// Compact kept indices to the front of the sort table.
indexes[post_nms_count] = indexes[i];
++post_nms_count;
}
if ((keep) && (ETA < 1) && (adaptive_threshold > 0.5))
{
adaptive_threshold *= ETA;
}
}
}
// Write number of scores to global memory, for proper output order in separated work groups
scores_size[idx_class] = post_nms_count;
}
stillSorting = true;
// Wait for scores number from all classes in images
barrier(CLK_LOCAL_MEM_FENCE);
// Every work item writes its share of kept detections for this class.
uint output_offset = (idx_image * NUM_CLASSES_OUT + idx_class - HIDDEN_CLASS) * SCORES_COUNT;
for (uint it = 0; it < NUM_OF_ITEMS; it++)
{
const uint local_id_out = local_id + it;
if (local_id_out < scores_size[idx_class])
{
const uint score_idx = indexes[local_id_out];
uint bb_idx = indexes[local_id_out];
const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
UNIT_TYPE decoded_bbox[4];
FUNC_CALL(get_decoded_bbox)(decoded_bbox, input_location, input_prior_box, bb_idx, shared_class, idx_image);
const uint out_idx = (local_id_out + output_offset) * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
output[out_idx] = TO_UNIT_TYPE(idx_image);
output[out_idx + 1] = TO_UNIT_TYPE(idx_class);
output[out_idx + 2] = FUNC_CALL(get_score)(input_confidence, score_idx, idx_class, idx_image);
output[out_idx + 3] = decoded_bbox[0];
output[out_idx + 4] = decoded_bbox[1];
output[out_idx + 5] = decoded_bbox[2];
output[out_idx + 6] = decoded_bbox[3];
}
}
// If work item is processing last bbox in image (we already know the number of all detections),
// use it to fill rest of keep_top_k items if number of detections is smaller
if (is_last_bbox_in_class)
{
uint out_idx = output_offset + scores_size[idx_class];
uint current_top_k = output_offset + SCORES_COUNT;
for (uint i = out_idx; i < current_top_k; i++)
{
// Dummy row: image_id = -1 marks an unused slot.
out_idx = i * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
output[out_idx] = -1.0;
output[out_idx + 1] = 0.0;
output[out_idx + 2] = 0.0;
output[out_idx + 3] = 0.0;
output[out_idx + 4] = 0.0;
output[out_idx + 5] = 0.0;
output[out_idx + 6] = 0.0;
}
}
// Write number of scores kept in first step of detection output
if (is_last_bbox_in_image)
{
uint scores_sum = 0;
for (uint i = 0; i < NUM_CLASSES; i++)
{
scores_sum += scores_size[i];
}
output[idx_image] = scores_sum;
}
}
}

View File

@ -1,217 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#include "include/detection_output_common.cl"
// Returns the confidence score of row idx_bbox for image idx_image from the
// NMS-stage output. The sentinel index KEEP_BBOXES_NUM marks a class whose
// boxes are exhausted; returning -1 sinks it below every real score in the sort.
UNIT_TYPE FUNC(get_score_sort)(__global UNIT_TYPE* input_bboxes, const uint idx_bbox, const uint idx_image)
{
if (idx_bbox == KEEP_BBOXES_NUM)
{
// Idx set to dummy value, return -1 to exclude this element from sorting
return -1;
}
else
{
return input_bboxes[(idx_bbox + idx_image * NUM_OF_IMAGE_BBOXES) * OUTPUT_ROW_SIZE + INPUT_OFFSET + SCORE_OFFSET];
}
}
// Second ("keep top-k") stage of detection output.
// Input layout (as used below): the first NUM_IMAGES elements of input_bboxes hold the
// per-image detection counts (see the reads at image_id / i indices); detection rows of
// OUTPUT_ROW_SIZE elements follow starting at INPUT_OFFSET, grouped per image and per class
// (NUM_OF_CLASS_BBOXES rows per class). Each row is
// [image_id, label, confidence, xmin, ymin, xmax, ymax].
// One work-group processes one image; scores are compared via FUNC_CALL(get_score_sort),
// which returns -1 for exhausted/dummy entries so they sink during sorting.
KERNEL (detection_output_sort)(__global UNIT_TYPE* input_bboxes, __global UNIT_TYPE* output)
{
// indexes[c] is the index of the current "head" (best unconsumed) bbox of class c.
__local uint indexes[NUM_CLASSES_IN];
__local bool stillSorting;
__local uint output_count;
// Number of rows already emitted per class (used to regroup results by class at the end).
__local uint num_out_per_class[NUM_CLASSES_IN];
// NOTE(review): these __local stores are performed by every work-item without a barrier
// before first use; presumably benign because all work-items write the same values /
// distinct slots — confirm against the dispatch configuration.
output_count = 0;
num_out_per_class[get_local_id(0)] = 0;
const uint image_id = (uint)get_global_id(0) / NUM_CLASSES_IN;
const uint local_id = (uint)get_local_id(0) * NUM_OF_ITEMS_SORT; // All bboxes from one image in work group
uint image_offset_input = image_id * NUM_OF_IMAGE_BBOXES;
// Prefix-sum the kept counts of all preceding images (each clamped to KEEP_TOP_K)
// to find where this image's rows start in the packed output buffer.
uint count_sum = 0;
for (uint i = 0; i < image_id; i++)
{
count_sum += (input_bboxes[i] < KEEP_TOP_K)? input_bboxes[i] : KEEP_TOP_K;
}
uint image_offset_output = count_sum * OUTPUT_ROW_SIZE;
// If there is less elements than needed, write input to output
if (input_bboxes[image_id] <= KEEP_TOP_K)
{
// Fast path: no sorting needed; work-item 0 copies all valid rows class by class.
if (local_id == 0)
{
for (uint class = 0; class < NUM_CLASSES_IN; class++)
{
// Skip the background class unless it was already hidden (removed) upstream.
if (class == BACKGROUND_LABEL_ID && !HIDDEN_CLASS)
{
continue;
}
for (uint i = 0; i < NUM_OF_CLASS_BBOXES; i++)
{
uint input_idx = (i + image_offset_input + class * NUM_OF_CLASS_BBOXES) * OUTPUT_ROW_SIZE + INPUT_OFFSET;
// image_id == -1 marks the end of valid rows within a class.
if (input_bboxes[input_idx] != -1)
{
uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output;
for (uint idx = 0; idx < OUTPUT_ROW_SIZE; idx++)
{
output[out_idx + idx] = input_bboxes[input_idx + idx];
}
output_count++;
}
else
{
break;
}
}
}
}
}
else
{
// Slow path: select the KEEP_BBOXES_NUM best rows across classes.
// sorted_output is private; only work-item 0 ever reads/writes it (inside local_id == 0).
uint sorted_output[KEEP_TOP_K * NUM_CLASSES_IN];
// Seed each class head index with the first bbox of that class.
for (uint it = 0; it < NUM_OF_ITEMS_SORT; it++)
{
indexes[local_id + it] = (local_id + it) * NUM_OF_CLASS_BBOXES;
}
while (output_count < KEEP_BBOXES_NUM)
{
stillSorting = true;
// Parallel odd-even transposition sort of the class head indexes by score
// (descending); repeats until a full pass makes no swaps.
while(stillSorting)
{
barrier(CLK_LOCAL_MEM_FENCE);
stillSorting = false;
for (uint it = 0; it < NUM_OF_ITEMS_SORT; it++)
{
uint item_id = local_id + it;
// i == 0: even-phase pairs; i == 1: odd-phase pairs.
for (uint i = 0; i < 2; i++)
{
uint idx1 = indexes[item_id];
uint idx2 = indexes[item_id+1];
bool perform = false;
if ((((i % 2) && (item_id % 2)) ||
((!(i % 2)) && (!(item_id % 2)))) &&
(item_id != (NUM_CLASSES_IN - 1)))
{
perform = true;
}
if (perform &&
(FUNC_CALL(get_score_sort)(input_bboxes, idx1, image_id) <
FUNC_CALL(get_score_sort)(input_bboxes, idx2, image_id)))
{
indexes[item_id] = idx2;
indexes[item_id+1] = idx1;
stillSorting = true;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
// Work-item 0 consumes every head that ties with the current best score.
if (local_id == 0)
{
UNIT_TYPE top_score = FUNC_CALL(get_score_sort)(input_bboxes, indexes[0], image_id);
if (top_score != 0)
{
for (uint it = 0; (it < NUM_CLASSES_IN) && (output_count < KEEP_BBOXES_NUM); it++)
{
if (FUNC_CALL(get_score_sort)(input_bboxes, indexes[it], image_id) == top_score)
{
// write to output, create counter, and check if keep_top_k is satisfied.
uint input_idx = (indexes[it] + image_offset_input) * OUTPUT_ROW_SIZE + INPUT_OFFSET;
// Row field [1] is the class label; HIDDEN_CLASS shifts labels when the
// background class was removed upstream.
uint class_idx = input_bboxes[input_idx + 1] - HIDDEN_CLASS;
sorted_output[class_idx * KEEP_TOP_K + num_out_per_class[class_idx]] = input_idx;
num_out_per_class[class_idx]++;
indexes[it]++;
output_count++;
// If all class elements are written to output, set dummy value to exclude class from sorting.
if ((indexes[it] % NUM_OF_CLASS_BBOXES) == 0)
{
indexes[it] = KEEP_BBOXES_NUM;
}
}
}
}
else
{
// There is no more significant results to sort.
output_count = KEEP_BBOXES_NUM;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Emit the selected rows grouped by class, then pad this image's slot with
// dummy rows (image_id = -1) up to its clamped count.
if (local_id == 0)
{
output_count = 0;
for (uint i = 0; i < NUM_CLASSES_IN; i++)
{
for (uint j = 0; j < num_out_per_class[i]; j++)
{
uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output;
for (uint idx = 0; idx < OUTPUT_ROW_SIZE; idx++)
{
output[out_idx + idx] = input_bboxes[sorted_output[i * KEEP_TOP_K + j] + idx];
}
output_count++;
}
}
uint image_count_sum = (input_bboxes[image_id] < KEEP_TOP_K)? input_bboxes[image_id] : KEEP_TOP_K;
// First clause is a deliberate no-op: continue from the current output_count.
for (output_count; output_count < image_count_sum; output_count++)
{
uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output;
output[out_idx] = -1.0;
output[out_idx + 1] = 0.0;
output[out_idx + 2] = 0.0;
output[out_idx + 3] = 0.0;
output[out_idx + 4] = 0.0;
output[out_idx + 5] = 0.0;
output[out_idx + 6] = 0.0;
}
}
}
// The work-group of the last image pads the global tail of the buffer with dummy
// rows so the output always holds KEEP_TOP_K * NUM_IMAGES rows.
if (local_id == 0 &&
image_id == (NUM_IMAGES - 1))
{
for (output_count += count_sum; output_count < (KEEP_TOP_K * NUM_IMAGES); output_count++ )
{
uint out_idx = output_count * OUTPUT_ROW_SIZE;
output[out_idx] = -1.0;
output[out_idx + 1] = 0.0;
output[out_idx + 2] = 0.0;
output[out_idx + 3] = 0.0;
output[out_idx + 4] = 0.0;
output[out_idx + 5] = 0.0;
output[out_idx + 6] = 0.0;
}
}
}

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -65,16 +65,8 @@ layout detection_output_inst::calc_output_layout(detection_output_node const& no
// Add space for number of output results per image - needed in the next detection output step
output_size += ((input_layout.size.batch[0] + 15) / 16) * 16;
if (node.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled()) {
return {input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, 1, output_size)};
} else {
return {input_layout.data_type,
cldnn::format::bfyx,
cldnn::tensor(1,
1,
DETECTION_OUTPUT_ROW_SIZE,
node.get_primitive()->keep_top_k * input_layout.size.batch[0])};
}
return {input_layout.data_type, cldnn::format::bfyx,
cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, node.get_primitive()->keep_top_k * input_layout.size.batch[0])};
}
std::string detection_output_inst::to_string(detection_output_node const& node) {
@ -203,82 +195,4 @@ detection_output_inst::typed_primitive_inst(network_impl& network, detection_out
"Detection output layer doesn't support input padding in Prior-Box input");
}
/************************ Detection Output keep_top_k part ************************/
// Returns the singleton type descriptor for the detection_output_sort primitive.
// The function-local static ("Meyers singleton") guarantees exactly one instance
// and thread-safe initialization on first use.
primitive_type_id detection_output_sort::type_id() {
static primitive_type_base<detection_output_sort> instance;
return &instance;
}
// Computes the output layout of a detection_output_sort node:
// a bfyx tensor of keep_top_k * num_images rows, each DETECTION_OUTPUT_ROW_SIZE wide
// ([image_id, label, confidence, xmin, ymin, xmax, ymax]).
// Requires exactly one input and no forced output data type.
layout detection_output_sort_inst::calc_output_layout(detection_output_sort_node const& node) {
assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
"Output data type forcing is not supported for "
"detection_output_sort_node!");
CLDNN_ERROR_NOT_EQUAL(node.id(),
"Detection output layer input number",
node.get_dependencies().size(),
"expected number of inputs",
static_cast<size_t>(1),
"");
auto input_layout = node.input().get_output_layout();
int keep_top_k = node.as<detection_output_sort>().get_primitive()->keep_top_k;
int num_images = node.as<detection_output_sort>().get_primitive()->num_images;
// If detection output sort is used as a second part of detection output get proper info from detection output node
// (num_images == 0 is the sentinel meaning "parameters were not set explicitly").
if (num_images == 0) {
CLDNN_ERROR_BOOL(node.id(),
"node.get_dependency(0).is_type<detection_output>()",
!node.get_dependency(0).is_type<detection_output>(),
"Cannot calculate output layout.");
input_layout = node.get_dependency(0).as<detection_output>().location().get_output_layout();
keep_top_k = node.get_dependency(0).as<detection_output>().get_primitive()->keep_top_k;
num_images = input_layout.size.batch[0];
}
// Batch size and feature size are 1.
// Number of bounding boxes to be kept is set to keep_top_k*batch size.
// If number of detections is lower than keep_top_k, will write dummy results at the end with image_id=-1.
// Each row is a 7 dimension vector, which stores:
// [image_id, label, confidence, xmin, ymin, xmax, ymax]
return {input_layout.data_type,
cldnn::format::bfyx,
cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, keep_top_k * num_images)};
}
// Renders a human-readable JSON description of a detection_output_sort node
// (used for debug dumps / logging).
// Fixes: the first field was labelled "num_classes:" while printing desc->num_images,
// producing a duplicate key and mislabelled value; the stray ':' suffixes and the
// "dection" typo in the section name are corrected as well.
std::string detection_output_sort_inst::to_string(detection_output_sort_node const& node) {
auto node_info = node.desc_to_json();
auto desc = node.get_primitive();
auto& input_bboxes = node.input();
std::stringstream primitive_description;
json_composite detec_out_info;
detec_out_info.add("input bboxes id", input_bboxes.id());
detec_out_info.add("num_images", desc->num_images);
detec_out_info.add("num_classes", desc->num_classes);
detec_out_info.add("keep_top_k", desc->keep_top_k);
detec_out_info.add("share_location", desc->share_location);
detec_out_info.add("top_k", desc->top_k);
detec_out_info.dump(primitive_description);
node_info->add("detection output info", detec_out_info);
node_info->dump(primitive_description);
return primitive_description.str();
}
// Instance constructor: validates that the input is in bfyx format and that the
// node carries no output padding (neither is supported by this primitive).
// NOTE(review): "Detecion" in the error string below is a typo, but it is a runtime
// message — left unchanged here.
detection_output_sort_inst::typed_primitive_inst(network_impl& network, detection_output_sort_node const& node)
: parent(network, node) {
CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
"Input memory format",
node.get_dependency(0).get_output_layout().format.value,
"expected bfyx input format",
format::bfyx);
CLDNN_ERROR_BOOL(node.id(),
"Detecion output layer padding",
node.is_padded(),
"Detection output layer doesn't support output padding.");
}
} // namespace cldnn

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -19,6 +19,7 @@
#include "network_impl.h"
#include "implementation_map.h"
#include "math_utils.h"
#include "register_gpu.hpp"
#include "cpu_impl_helpers.hpp"
#include <algorithm>
@ -636,7 +637,14 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
static primitive_impl* create(const detection_output_node& arg) { return new detection_output_cpu(arg); }
};
primitive_impl* runDetectOutCpu(const detection_output_node& arg) { return new detection_output_cpu(arg); }
namespace detail {
attach_detection_output_gpu::attach_detection_output_gpu() {
implementation_map<detection_output>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), detection_output_cpu::create);
implementation_map<detection_output>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), detection_output_cpu::create);
}
} // namespace detail
} // namespace gpu
} // namespace cldnn

View File

@ -1,164 +0,0 @@
/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "detection_output_inst.h"
#include "primitive_gpu_base.h"
#include "error_handler.h"
#include "kernel_selector_helper.h"
#include "detection_output/detection_output_kernel_base.h"
#include "detection_output/detection_output_kernel_selector.h"
#ifdef FIX_OPENMP_RELEASE_ISSUE
#ifdef OPENMP_FOUND
#include <omp.h>
#endif
#endif
namespace cldnn {
namespace gpu {
// GPU implementation of the first detection_output stage. Falls back to the CPU
// implementation unless the detection_output_gpu build option is enabled.
struct detection_output_gpu : typed_primitive_gpu_impl<detection_output> {
using parent = typed_primitive_gpu_impl<detection_output>;
using parent::parent;

private:
// Copies all detection-output attributes from the cldnn primitive into the
// kernel-selector parameter struct, plus the confidence input's buffer size
// and lower padding (needed by the kernel to address padded inputs).
static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams,
const detection_output_node& arg) {
auto primitive = arg.get_primitive();
detectOutParams.keep_top_k = primitive->keep_top_k;
detectOutParams.num_classes = primitive->num_classes;
detectOutParams.top_k = primitive->top_k;
detectOutParams.background_label_id = primitive->background_label_id;
detectOutParams.code_type = (int32_t)primitive->code_type;
detectOutParams.share_location = primitive->share_location;
detectOutParams.variance_encoded_in_target = primitive->variance_encoded_in_target;
detectOutParams.nms_threshold = primitive->nms_threshold;
detectOutParams.eta = primitive->eta;
detectOutParams.confidence_threshold = primitive->confidence_threshold;
detectOutParams.prior_coordinates_offset = primitive->prior_coordinates_offset;
detectOutParams.prior_info_size = primitive->prior_info_size;
detectOutParams.prior_is_normalized = primitive->prior_is_normalized;
detectOutParams.input_width = primitive->input_width;
// "input_heigh" (sic) is the kernel-selector field's spelling; cannot rename here.
detectOutParams.input_heigh = primitive->input_height;
detectOutParams.conf_size_x = arg.confidence().get_output_layout().get_buffer_size().spatial[0];
detectOutParams.conf_size_y = arg.confidence().get_output_layout().get_buffer_size().spatial[1];
detectOutParams.conf_padding_x = arg.confidence().get_output_layout().data_padding.lower_size().spatial[0];
detectOutParams.conf_padding_y = arg.confidence().get_output_layout().data_padding.lower_size().spatial[1];
}

public:
// Factory: selects the best available GPU kernel for this node, or delegates to
// the CPU implementation when the GPU path is disabled in the build options.
// Returns a heap-allocated impl owned by the caller.
static primitive_impl* create(const detection_output_node& arg) {
if (!arg.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled()) {
return runDetectOutCpu(arg);
}

auto detect_out_params = get_default_params<kernel_selector::detection_output_params>(arg);
auto detect_out_optional_params =
get_default_optional_params<kernel_selector::detection_output_optional_params>(arg.get_program());

setDetectOutSpecificParams(detect_out_params.detectOutParams, arg);

auto& kernel_selector = kernel_selector::detection_output_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params);

CLDNN_ERROR_BOOL(arg.id(),
"Best_kernel.empty()",
best_kernels.empty(),
"Cannot find a proper kernel with this arguments");

auto detect_out = new detection_output_gpu(arg, best_kernels[0]);

return detect_out;
}
};
// Factory wrapper: builds a GPU detection_output implementation around a
// kernel that was already selected by the caller. The caller owns the result.
primitive_impl* runDetectOutGpu(const detection_output_node& arg, kernel_selector::KernelData kernel) {
    auto* impl = new detection_output_gpu(arg, kernel);
    return impl;
}
/************************ Detection Output keep_top_k part ************************/
// GPU implementation of the second ("keep top-k" / sort) detection_output stage.
struct detection_output_sort_gpu : typed_primitive_gpu_impl<detection_output_sort> {
    using parent = typed_primitive_gpu_impl<detection_output_sort>;
    using parent::parent;

private:
    // Fills the kernel-selector params. When this node was auto-inserted after a
    // detection_output node, the real parameters live on that producer, so they are
    // taken from it (and num_images from its location input's batch size); otherwise
    // the node's own primitive carries them.
    // NOTE(review): the two branches assign the same fields but read from primitives of
    // *different* types (detection_output vs detection_output_sort), so the duplication
    // cannot be hoisted without a template helper.
    static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams,
                                           const detection_output_sort_node& arg) {
        if (arg.get_dependency(0).is_type<detection_output>()) {
            auto primitive = arg.get_dependency(0).as<detection_output>().get_primitive();
            detectOutParams.keep_top_k = primitive->keep_top_k;
            detectOutParams.num_classes = primitive->num_classes;
            detectOutParams.num_images =
                arg.get_dependency(0).as<detection_output>().location().get_output_layout().size.batch[0];
            detectOutParams.top_k = primitive->top_k;
            detectOutParams.share_location = primitive->share_location;
            detectOutParams.background_label_id = primitive->background_label_id;
        } else {
            auto primitive = arg.get_primitive();
            detectOutParams.keep_top_k = primitive->keep_top_k;
            detectOutParams.num_classes = primitive->num_classes;
            detectOutParams.num_images = primitive->num_images;
            detectOutParams.top_k = primitive->top_k;
            detectOutParams.share_location = primitive->share_location;
            detectOutParams.background_label_id = primitive->background_label_id;
        }
    }

public:
    // Factory: selects the best available GPU sort kernel for this node.
    // Returns a heap-allocated impl owned by the caller.
    static primitive_impl* create(const detection_output_sort_node& arg) {
        auto detect_out_params = get_default_params<kernel_selector::detection_output_params>(arg);
        auto detect_out_optional_params =
            get_default_optional_params<kernel_selector::detection_output_optional_params>(arg.get_program());

        setDetectOutSpecificParams(detect_out_params.detectOutParams, arg);

        auto& kernel_selector = kernel_selector::detection_output_sort_kernel_selector::Instance();
        auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params);

        CLDNN_ERROR_BOOL(arg.id(),
                         "Best_kernel.empty()",
                         best_kernels.empty(),
                         "Cannot find a proper kernel with this arguments");

        auto detect_out = new detection_output_sort_gpu(arg, best_kernels[0]);

        return detect_out;
    }
};
// Factory wrapper: builds a GPU detection_output_sort implementation around a
// kernel that was already selected by the caller. The caller owns the result.
primitive_impl* runDetectOutSortGpu(const detection_output_sort_node& arg, kernel_selector::KernelData kernel) {
    auto* impl = new detection_output_sort_gpu(arg, kernel);
    return impl;
}
namespace detail {
// Registers the GPU implementations of detection_output and detection_output_sort
// for the fp32/fp16 bfyx layouts supported by the OCL engine.
attach_detection_output_gpu::attach_detection_output_gpu() {
    const auto key_f32 = std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx);
    const auto key_f16 = std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx);
    implementation_map<detection_output>::add(key_f32, detection_output_gpu::create);
    implementation_map<detection_output>::add(key_f16, detection_output_gpu::create);
    implementation_map<detection_output_sort>::add(key_f32, detection_output_sort_gpu::create);
    implementation_map<detection_output_sort>::add(key_f16, detection_output_sort_gpu::create);
}
} // namespace detail
} // namespace gpu
} // namespace cldnn

View File

@ -120,56 +120,6 @@ void graph_initializations::replace_nodes(program_impl& p) {
}
}
// When the detection_output_gpu build option is enabled, splits every
// detection_output node into a two-stage pipeline: the original node is renamed
// to "<id>_pre" and a detection_output_sort node takes over the original id
// (so downstream users keep their references), wired in between the node and
// its first user (or appended when there are no users).
// Iterates with iterators because nodes_map is mutated during the walk.
void graph_initializations::handle_detection_output(program_impl& p) {
auto itr = p.nodes_map.begin(); // note we need to use iterators since currently processed element can be removed
while (itr != p.nodes_map.end()) {
auto node_itr = itr++;
auto& node = *(*node_itr).second;
// Create second part detection output primitive and replace nodes names - do it only once
if ((p.get_options().get<build_option_type::detection_output_gpu>()->enabled()) &&
(node.is_type<detection_output>()) &&
(node.id().find("_pre") ==
std::string::npos)) { // ToDo: this will fail if user will name the primitive with using _pre like do_pre
// we need to use node mark() or some other idea to prevent it
// rename detection output
const primitive_id detect_out_node_name = node.id();
const primitive_id new_primitive_id = detect_out_node_name + "_pre";
p.rename(node, new_primitive_id);

auto detect_out_prim = node.as<detection_output>().typed_desc();
// Create new primitive, "keep top k" part of detection output
// ToDo: add a default parameters to the detection_output_sort class constructor to get rid off this
// initialization from here
auto detect_out_sort_prim =
std::make_shared<detection_output_sort>(detect_out_node_name,
node.id(),
// not important params here - it will be set during
// "primitive_impl* create" func in "detection_output_sort_gpu"
0,      // num_images
0,      // num_classes
0,      // keep_top_k
false,  // share_location
0,      // top_k
-1,     // background_label_id
detect_out_prim->output_padding);

p.get_or_create(detect_out_sort_prim);
auto sort_node_itr = p.nodes_map.find(detect_out_node_name);
// Defensive: skip if the sort node was not actually created.
if (sort_node_itr == p.nodes_map.end()) continue;
auto sort_node = sort_node_itr->second;

// Add connection to second part of detection output
if (node.get_users().size()) {
p.add_intermediate(*sort_node, *(node.get_users().front()), 0, false);
} else {
p.add_connection(node, *sort_node);
}
}
}
}
void graph_initializations::handle_lstm(program_impl& p) {
bool has_lstm_children;
auto itr = p.nodes_map.begin(); // note we need to use iterators since currently processed element can be removed
@ -528,7 +478,6 @@ void graph_initializations::set_outputs(program_impl& p) {
void graph_initializations::run(program_impl& p) {
replace_nodes(p);
handle_detection_output(p);
handle_lstm(p);
handle_dynamic_lstm(p);
set_outputs(p);

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -60,36 +60,4 @@ public:
using detection_output_inst = typed_primitive_inst<detection_output>;
template <>
class typed_program_node<detection_output_sort> : public typed_program_node_base<detection_output_sort> {
using parent = typed_program_node_base<detection_output_sort>;
public:
using parent::parent;
program_node& input() const { return get_dependency(0); }
};
using detection_output_sort_node = typed_program_node<detection_output_sort>;
template <>
class typed_primitive_inst<detection_output_sort> : public typed_primitive_inst_base<detection_output_sort> {
using parent = typed_primitive_inst_base<detection_output_sort>;
public:
static layout calc_output_layout(detection_output_sort_node const& node);
static std::string to_string(detection_output_sort_node const& node);
public:
typed_primitive_inst(network_impl& network, detection_output_sort_node const& node);
};
using detection_output_sort_inst = typed_primitive_inst<detection_output_sort>;
namespace gpu {
primitive_impl* runDetectOutCpu(const detection_output_node& arg);
primitive_impl* runDetectOutGpu(const detection_output_node& arg, kernel_selector::KernelData kernel);
primitive_impl* runDetectOutSortGpu(const detection_output_sort_node& arg, kernel_selector::KernelData kernel);
} // namespace gpu
} // namespace cldnn
} // namespace cldnn

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -115,7 +115,6 @@ public:
private:
void run(program_impl& p) override;
void replace_nodes(program_impl& p);
void handle_detection_output(program_impl& p);
void handle_lstm(program_impl& p);
void handle_dynamic_lstm(program_impl& p);
void set_outputs(program_impl& p);

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -24,8 +24,7 @@
#include <api/engine.hpp>
#include "test_utils/test_utils.h"
namespace cldnn
{
namespace cldnn {
template<> struct type_to_data_type<FLOAT16> { static const data_types value = data_types::f16; };
}
@ -33,18 +32,15 @@ using namespace cldnn;
using namespace tests;
template <typename T>
class detection_output_test : public ::testing::Test
{
class detection_output_test : public ::testing::Test {
public:
detection_output_test() :
nms_threshold(0.1f)
{}
nms_threshold(0.1f) {}
void init_buffers(cldnn::memory prior_memory, cldnn::memory confidence_memory, cldnn::memory location_memory,
bool share_location, bool variance_encoded_in_target = false,
int prior_info_size = 4, int prior_coordinates_offset = 0, bool prior_is_normalized = true)
{
int prior_info_size = 4, int prior_coordinates_offset = 0, bool prior_is_normalized = true) {
auto location_ptr = location_memory.pointer<T>();
auto confidence_ptr = confidence_memory.pointer<T>();
auto prior_box_ptr = prior_memory.pointer<T>();
@ -59,11 +55,9 @@ public:
const float prior_multiplier = prior_is_normalized ? 1.0f : static_cast<float>(this->img_size);
const float variance = 0.1f;
int idx = 0;
for (int h = 0; h < 2; ++h)
{
for (int h = 0; h < 2; ++h) {
float center_y = (h + 0.5f) * step;
for (int w = 0; w < 2; ++w)
{
for (int w = 0; w < 2; ++w) {
float center_x = (w + 0.5f) * step;
prior_data[idx+prior_coordinates_offset+0] = (center_x - box_size / 2) * prior_multiplier;
prior_data[idx+prior_coordinates_offset+1] = (center_y - box_size / 2) * prior_multiplier;
@ -73,28 +67,20 @@ public:
idx += prior_info_size;
}
}
if (!variance_encoded_in_target)
{
for (int i = 0; i < idx; ++i)
{
if (!variance_encoded_in_target) {
for (int i = 0; i < idx; ++i) {
prior_data[idx + i] = variance;
}
}
// Fill confidences.
idx = 0;
for (int i = 0; i < num_of_images; ++i)
{
for (int j = 0; j < num_priors; ++j)
{
for (int c = 0; c < num_classes; ++c)
{
if (i % 2 == c % 2)
{
for (int i = 0; i < num_of_images; ++i) {
for (int j = 0; j < num_priors; ++j) {
for (int c = 0; c < num_classes; ++c) {
if (i % 2 == c % 2) {
confidence_data[idx++] = j * 0.2f;
}
else
{
} else {
confidence_data[idx++] = 1 - j * 0.2f;
}
}
@ -105,14 +91,10 @@ public:
const int num_loc_classes = share_location ? 1 : num_classes;
const float loc_multiplier = variance_encoded_in_target ? variance : 1.0f;
idx = 0;
for (int i = 0; i < num_of_images; ++i)
{
for (int h = 0; h < 2; ++h)
{
for (int w = 0; w < 2; ++w)
{
for (int c = 0; c < num_loc_classes; ++c)
{
for (int i = 0; i < num_of_images; ++i) {
for (int h = 0; h < 2; ++h) {
for (int w = 0; w < 2; ++w) {
for (int c = 0; c < num_loc_classes; ++c) {
location_data[idx++] = (w % 2 ? -1 : 1) * (i * 1 + c / 2.f + 0.5f) * loc_multiplier;
location_data[idx++] = (h % 2 ? -1 : 1) * (i * 1 + c / 2.f + 0.5f) * loc_multiplier;
location_data[idx++] = (w % 2 ? -1 : 1) * (i * 1 + c / 2.f + 0.5f) * loc_multiplier;
@ -123,8 +105,7 @@ public:
}
}
void init_buffer_sort(cldnn::memory input_buff)
{
void init_buffer_sort(cldnn::memory input_buff) {
auto input_data_ptr = input_buff.pointer<T>();
EXPECT_EQ((int)input_buff.count(), 128);
@ -150,8 +131,7 @@ public:
input_data[121] = -1; input_data[122] = 0; input_data[123] = 0; input_data[124] = 0; input_data[125] = 0; input_data[126] = 0; input_data[127] = 0;
}
void check_results(const memory& output, const int num, const std::string values)
{
void check_results(const memory& output, const int num, const std::string values) {
assert(num < output.get_layout().size.spatial[1]);
// Split values to vector of items.
@ -163,18 +143,15 @@ public:
// Check data.
auto out_ptr = output.pointer<T>();
const T* data = out_ptr.data();
for (int i = 0; i < 2; ++i)
{
for (int i = 0; i < 2; ++i) {
EXPECT_EQ(static_cast<int>((float)data[num * output.get_layout().size.spatial[0] + i]), atoi(items[i].c_str()));
}
for (int i = 2; i < 7; ++i)
{
for (int i = 2; i < 7; ++i) {
EXPECT_TRUE(floating_point_equal(data[num * output.get_layout().size.spatial[0] + i], (T)(float)atof(items[i].c_str())));
}
}
void setup_basic(bool runOnGPU)
{
void setup_basic() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 150;
@ -192,11 +169,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -213,8 +185,7 @@ public:
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
}
void setup_two_layers(bool runOnGPU)
{
void setup_two_layers() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 150;
@ -233,11 +204,6 @@ public:
topology.add(detection_output("detection_output_2", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -247,8 +213,7 @@ public:
EXPECT_EQ(outputs.size(), size_t(2));
unsigned i = 1;
for (auto it = outputs.begin(); it != outputs.begin(); it++)
{
for (auto it = outputs.begin(); it != outputs.begin(); it++) {
EXPECT_EQ(it->first, "detection_output_" + std::to_string(i));
@ -260,8 +225,7 @@ public:
}
}
void forward_share_location(bool runOnGPU)
{
void forward_share_location() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -282,11 +246,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -314,8 +273,7 @@ public:
check_results(output_prim, 7, "-1 0 0 0 0 0 0");
}
void forward_num_detections_greater_than_keep_top_k(bool runOnGPU)
{
void forward_num_detections_greater_than_keep_top_k() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 1;
@ -336,11 +294,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -362,8 +315,7 @@ public:
check_results(output_prim, 1, "1 1 0.6 0.45 0.45 0.75 0.75");
}
void forward_num_detections_smaller_than_keep_top_k(bool runOnGPU)
{
void forward_num_detections_smaller_than_keep_top_k() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 6;
@ -384,11 +336,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -420,8 +367,7 @@ public:
check_results(output_prim, 11, "-1 0 0 0 0 0 0");
}
void test_forward_share_location_top_k(bool runOnGPU)
{
void test_forward_share_location_top_k() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 2;
@ -443,11 +389,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -471,8 +412,7 @@ public:
check_results(output_prim, 3, "-1 0 0 0 0 0 0");
}
void forward_no_share_location(bool runOnGPU)
{
void forward_no_share_location() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 10;
@ -493,11 +433,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -537,8 +472,7 @@ public:
check_results(output_prim, 19, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_top_k(bool runOnGPU)
{
void forward_no_share_location_top_k() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -560,11 +494,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -592,8 +521,7 @@ public:
check_results(output_prim, 7, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_neg_0(bool runOnGPU)
{
void forward_no_share_location_neg_0() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 5;
@ -614,11 +542,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -648,8 +571,7 @@ public:
check_results(output_prim, 9, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_neg_0_top_k(bool runOnGPU)
{
void forward_no_share_location_neg_0_top_k() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 2;
@ -671,11 +593,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -699,8 +616,7 @@ public:
check_results(output_prim, 3, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_top_k_input_padding(bool runOnGPU)
{
void forward_no_share_location_top_k_input_padding() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -723,11 +639,6 @@ public:
topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -755,8 +666,7 @@ public:
check_results(output_prim, 7, "-1 0 0 0 0 0 0");
}
void test_forward_no_share_location_top_k_faster_rcnn_case(bool runOnGPU)
{
void test_forward_no_share_location_top_k_faster_rcnn_case() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -792,11 +702,6 @@ public:
));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -834,181 +739,50 @@ public:
typedef ::testing::Types<float, FLOAT16> detection_output_test_types;
TYPED_TEST_CASE(detection_output_test, detection_output_test_types);
TYPED_TEST(detection_output_test, test_setup_basic)
{
this->setup_basic(false);
TYPED_TEST(detection_output_test, test_setup_basic) {
this->setup_basic();
}
TYPED_TEST(detection_output_test, test_setup_basic_gpu)
{
this->setup_basic(true);
TYPED_TEST(detection_output_test, test_setup_two_layers) {
this->setup_two_layers();
}
TYPED_TEST(detection_output_test, test_setup_two_layers)
{
this->setup_two_layers(false);
TYPED_TEST(detection_output_test, test_forward_share_location) {
this->forward_share_location();
}
TYPED_TEST(detection_output_test, test_setup_two_layers_gpu)
{
this->setup_two_layers(true);
TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k) {
this->forward_num_detections_greater_than_keep_top_k();
}
TYPED_TEST(detection_output_test, test_forward_share_location)
{
this->forward_share_location(false);
TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k) {
this->forward_num_detections_smaller_than_keep_top_k();
}
TYPED_TEST(detection_output_test, DISABLED_test_forward_share_location_gpu)
{
this->forward_share_location(true);
TYPED_TEST(detection_output_test, test_forward_share_location_top_k) {
this->test_forward_share_location_top_k();
}
TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k)
{
this->forward_num_detections_greater_than_keep_top_k(false);
TYPED_TEST(detection_output_test, test_forward_no_share_location) {
this->forward_no_share_location();
}
TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k_gpu)
{
this->forward_num_detections_greater_than_keep_top_k(true);
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k) {
this->forward_no_share_location_top_k();
}
TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k)
{
this->forward_num_detections_smaller_than_keep_top_k(false);
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0) {
this->forward_no_share_location_neg_0();
}
TYPED_TEST(detection_output_test, DISABLED_test_forward_num_detections_smaller_than_keep_top_k_gpu)
{
this->forward_num_detections_smaller_than_keep_top_k(true);
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k) {
this->forward_no_share_location_neg_0_top_k();
}
TYPED_TEST(detection_output_test, test_forward_share_location_top_k)
{
this->test_forward_share_location_top_k(false);
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding) {
this->forward_no_share_location_top_k_input_padding();
}
TYPED_TEST(detection_output_test, test_forward_share_location_top_k_gpu)
{
this->test_forward_share_location_top_k(true);
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case) {
this->test_forward_no_share_location_top_k_faster_rcnn_case();
}
TYPED_TEST(detection_output_test, test_forward_no_share_location)
{
this->forward_no_share_location(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_gpu)
{
this->forward_no_share_location(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k)
{
this->forward_no_share_location_top_k(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_gpu)
{
this->forward_no_share_location_top_k(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0)
{
this->forward_no_share_location_neg_0(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_gpu)
{
this->forward_no_share_location_neg_0(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k)
{
this->forward_no_share_location_neg_0_top_k(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k_gpu)
{
this->forward_no_share_location_neg_0_top_k(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding)
{
this->forward_no_share_location_top_k_input_padding(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding_gpu)
{
this->forward_no_share_location_top_k_input_padding(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case)
{
this->test_forward_no_share_location_top_k_faster_rcnn_case(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case_gpu)
{
this->test_forward_no_share_location_top_k_faster_rcnn_case(true);
}
TYPED_TEST(detection_output_test, test_detection_output_sort_gpu)
{
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 10;
const int background_label_id = -1;
const int top_k = -1;
const unsigned out_row_size = 7;
const unsigned score_space = ((this->num_of_images + 15) / 16) * 16;
int input_size = this->num_of_images * num_loc_classes * this->num_priors * out_row_size + score_space;
const auto& engine = get_test_engine();
cldnn::memory input_buff = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 1, 1, input_size } });
this->init_buffer_sort(input_buff);
topology topology;
topology.add(input_layout("input_location", input_buff.get_layout()));
topology.add(detection_output_sort("detection_output_sort", "input_location", this->num_of_images, this->num_classes, keep_top_k, share_location, top_k, background_label_id));
network network(engine, topology);
network.set_input_data("input_location", input_buff);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "detection_output_sort");
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
auto output_prim = outputs.begin()->second.get_memory();
this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
this->check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45");
this->check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45");
this->check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50");
this->check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50");
this->check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80");
this->check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80");
this->check_results(output_prim, 8, "1 0 1.0 0.25 0.25 0.55 0.55");
this->check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75");
this->check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70");
this->check_results(output_prim, 11, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 12, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 13, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 14, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 15, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 16, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 17, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 18, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 19, "-1 0 0 0 0 0 0");
}