[IE CLDNN] Removed unused DO gpu impl (#2809)

This commit is contained in:
Vladimir Paramuzov 2020-10-28 09:24:22 +03:00 committed by GitHub
parent abb8817cf6
commit c7661078d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 77 additions and 1585 deletions

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -142,52 +142,6 @@ struct detection_output : public primitive_base<detection_output> {
protected:
};
/// @brief Generates a list of detections based on location and confidence predictions by doing non maximum suppression.
/// @details Each row is a 7 dimension vector, which stores: [image_id, label, confidence, xmin, ymin, xmax, ymax].
/// If number of detections per image is lower than keep_top_k, will write dummy results at the end with image_id=-1.
struct detection_output_sort
: public primitive_base<detection_output_sort> {
CLDNN_DECLARE_PRIMITIVE(detection_output_sort)
/// @brief Constructs detection output primitive.
/// @param id This primitive id.
/// @param input_bboxes Input bounding boxes primitive id.
/// @param num_images Number of images to be predicted.
/// @param num_classes Number of classes to be predicted.
/// @param keep_top_k Number of total bounding boxes to be kept per image after NMS step.
/// @param share_location If true bounding box are shared among different classes.
/// @param top_k Maximum number of results to be kept in NMS.
/// @param background_label_id Background label id (-1 if there is no background class).
/// @param output_padding Output padding.
detection_output_sort(const primitive_id& id,
const primitive_id& input_bboxes,
const uint32_t num_images,
const uint32_t num_classes,
const uint32_t keep_top_k,
const bool share_location = true,
const int top_k = -1,
const int background_label_id = -1,
const padding& output_padding = padding())
: primitive_base(id, {input_bboxes}, output_padding),
num_images(num_images),
num_classes(num_classes),
keep_top_k(keep_top_k),
share_location(share_location),
top_k(top_k),
background_label_id(background_label_id) {}
/// @brief Number of images to be predicted.
const uint32_t num_images;
/// @brief Number of classes to be predicted.
const uint32_t num_classes;
/// @brief Number of total bounding boxes to be kept per image after NMS step.
const int keep_top_k;
/// @brief If true, bounding boxes are shared among different classes.
const bool share_location;
/// @brief Maximum number of results to be kept in NMS.
const int top_k;
/// @brief Background label id (-1 if there is no background class).
const int background_label_id;
};
/// @}
/// @}
/// @}

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -46,9 +46,6 @@ enum class build_option_type {
/// @brief Enable implicit static input reordering for user inputs (default: false).
allow_static_input_reorder,
/// @brief Enable running detection output layer always on gpu, regardless performance
detection_output_gpu,
/// @brief Enable debug mode (default: false).
/// @details This option enforce all program primitives to be accessible as outputs.
debug,
@ -130,9 +127,6 @@ struct build_option {
/// @brief Enable implicit reordering for static user inputs (default: false).
static std::shared_ptr<const build_option> allow_static_input_reorder(bool enable = false);
/// @brief Enable running detection output layer always on GPU, regardless performance (default: false).
static std::shared_ptr<const build_option> detection_output_gpu(bool enable = false);
/// @brief Enable debug mode (default: false).
/// @details This option enforce all program primitives to be accessible as outputs.
static std::shared_ptr<const build_option> debug(bool enable = false);
@ -323,11 +317,6 @@ struct build_option_traits<build_option_type::allow_static_input_reorder> {
static std::shared_ptr<const build_option> make_default() { return build_option::allow_static_input_reorder(); }
};
template <>
struct build_option_traits<build_option_type::detection_output_gpu> {
typedef build_option_bool<build_option_type::detection_output_gpu> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::detection_output_gpu(); }
};
template <>
struct build_option_traits<build_option_type::debug> {
typedef build_option_bool<build_option_type::debug> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::debug(); }
@ -384,10 +373,6 @@ inline std::shared_ptr<const build_option> build_option::allow_static_input_reor
return std::make_shared<build_option_bool<build_option_type::allow_static_input_reorder>>(enable);
}
inline std::shared_ptr<const build_option> build_option::detection_output_gpu(bool enable) {
return std::make_shared<build_option_bool<build_option_type::detection_output_gpu>>(enable);
}
inline std::shared_ptr<const build_option> build_option::debug(bool enable) {
return std::make_shared<build_option_bool<build_option_type::debug>>(enable);
}

View File

@ -56,7 +56,6 @@ enum class KernelType {
PYRAMID_ROI_ALIGN,
CONTRACT,
ONE_HOT,
DETECTION_OUTPUT,
GATHER,
SCATTER_UPDATE,
DEPTH_TO_SPACE,

View File

@ -1,64 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_base.h"
namespace kernel_selector {
JitConstants DetectionOutputKernelBase::GetJitConstants(const detection_output_params& params) const {
    // Start from the constants shared by every kernel, then append the
    // detection-output specific defines consumed by the .cl sources.
    JitConstants jit = MakeBaseParamsJitConstants(params);

    const auto& do_params = params.detectOutParams;

    // Problem dimensions and NMS selection limits.
    jit.AddConstants({
        MakeJitConstant("NUM_IMAGES", do_params.num_images),
        MakeJitConstant("NUM_CLASSES", do_params.num_classes),
        MakeJitConstant("KEEP_TOP_K", do_params.keep_top_k),
        MakeJitConstant("TOP_K", do_params.top_k),
        MakeJitConstant("BACKGROUND_LABEL_ID", do_params.background_label_id),
        MakeJitConstant("CODE_TYPE", do_params.code_type),
    });

    // Confidence-input layout.
    jit.AddConstants({
        MakeJitConstant("CONF_SIZE_X", do_params.conf_size_x),
        MakeJitConstant("CONF_SIZE_Y", do_params.conf_size_y),
        MakeJitConstant("CONF_PADDING_X", do_params.conf_padding_x),
        MakeJitConstant("CONF_PADDING_Y", do_params.conf_padding_y),
    });

    // NMS behaviour and prior-box description.
    jit.AddConstants({
        MakeJitConstant("SHARE_LOCATION", do_params.share_location),
        MakeJitConstant("VARIANCE_ENCODED_IN_TARGET", do_params.variance_encoded_in_target),
        MakeJitConstant("NMS_THRESHOLD", do_params.nms_threshold),
        MakeJitConstant("ETA", do_params.eta),
        MakeJitConstant("CONFIDENCE_THRESHOLD", do_params.confidence_threshold),
        MakeJitConstant("IMAGE_WIDTH", do_params.input_width),
        MakeJitConstant("IMAGE_HEIGH", do_params.input_heigh),  // spelling kept: the .cl sources use IMAGE_HEIGH
        MakeJitConstant("ELEMENTS_PER_THREAD", do_params.elements_per_thread),
        MakeJitConstant("PRIOR_COORD_OFFSET", do_params.prior_coordinates_offset),
        MakeJitConstant("PRIOR_INFO_SIZE", do_params.prior_info_size),
        MakeJitConstant("PRIOR_IS_NORMALIZED", do_params.prior_is_normalized),
    });

    return jit;
}
DetectionOutputKernelBase::DispatchData DetectionOutputKernelBase::SetDefault(const detection_output_params& /*params*/) const {
    // Derived kernels compute the real ND-range; the base class only hands
    // back a zeroed dispatch descriptor.
    DispatchData dispatch_data;
    for (size_t i = 0; i < 3; ++i) {
        dispatch_data.gws[i] = 0;
        dispatch_data.lws[i] = 0;
    }
    return dispatch_data;
}
} // namespace kernel_selector

View File

@ -1,78 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "kernel_base_opencl.h"
#include "kernel_selector_params.h"
namespace kernel_selector {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// detection_output_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Parameters for the detection_output kernels (NMS and sort stages).
struct detection_output_params : public base_params {
detection_output_params() : base_params(KernelType::DETECTION_OUTPUT), detectOutParams() {}
// Detection-output specific knobs; translated 1:1 into JIT preprocessor
// constants by DetectionOutputKernelBase::GetJitConstants.
struct DedicatedParams {
uint32_t num_images;               // batch size (images per dispatch)
uint32_t num_classes;              // number of predicted classes
int32_t keep_top_k;                // max detections kept per image after NMS
int32_t top_k;                     // max candidates kept inside NMS (-1 = unlimited)
int32_t background_label_id;       // background class id (-1 if none)
int32_t code_type;                 // bbox encoding scheme -- presumably caffe PriorBox code_type; confirm against caller
int32_t conf_size_x;               // confidence input width
int32_t conf_size_y;               // confidence input height
int32_t conf_padding_x;            // confidence input x padding
int32_t conf_padding_y;            // confidence input y padding
int32_t elements_per_thread;       // work items granularity used by the .cl kernels
int32_t input_width;               // source image width (JIT: IMAGE_WIDTH)
int32_t input_heigh;               // source image height (JIT: IMAGE_HEIGH; spelling kept to match kernels)
int32_t prior_coordinates_offset;  // offset of coords within a prior-box record
int32_t prior_info_size;           // size of one prior-box record
bool prior_is_normalized;          // true if prior coords are in [0,1]
bool share_location;               // true if bbox predictions are shared across classes
bool variance_encoded_in_target;   // true if variance is folded into targets
float nms_threshold;               // IoU threshold for suppression
float eta;                         // adaptive-threshold decay factor
float confidence_threshold;        // min score to consider a detection
};
DedicatedParams detectOutParams;
virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// detection_output_optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// No extra optional data beyond the common optional_params; only tags the
// kernel type so the selector can match it.
struct detection_output_optional_params : optional_params {
detection_output_optional_params() : optional_params(KernelType::DETECTION_OUTPUT) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// DetectionOutputKernelBase
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Common base for the detection-output GPU kernels: supplies the shared JIT
// constants and a zero-initialized default dispatch descriptor.
class DetectionOutputKernelBase : public KernelBaseOpenCL {
public:
    using KernelBaseOpenCL::KernelBaseOpenCL;  // inherit ctors (fixed stray space before '::')
    virtual ~DetectionOutputKernelBase() = default;
    using DispatchData = CommonDispatchData;

protected:
    // Builds the per-layer preprocessor defines consumed by the .cl sources.
    JitConstants GetJitConstants(const detection_output_params& params) const;
    // Returns a zeroed dispatch descriptor; derived kernels override it.
    virtual DispatchData SetDefault(const detection_output_params& params) const;
};
} // namespace kernel_selector

View File

@ -1,88 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_ref.h"
#include "kernel_selector_utils.h"
#define PRIOR_BOX_SIZE 4 // Each prior-box consists of [xmin, ymin, xmax, ymax].
namespace kernel_selector {
ParamsKey DetectionOutputKernel::GetSupportedKey() const {
    ParamsKey key;
    // fp16/fp32 inputs and outputs, bfyx layout only.
    key.EnableInputDataType(Datatype::F16);
    key.EnableInputDataType(Datatype::F32);
    key.EnableOutputDataType(Datatype::F16);
    key.EnableOutputDataType(Datatype::F32);
    key.EnableInputLayout(DataLayout::bfyx);
    key.EnableOutputLayout(DataLayout::bfyx);
    // Padded/strided tensors and batched execution are supported.
    key.EnableTensorOffset();
    key.EnableTensorPitches();
    key.EnableBatching();
    return key;
}
// Computes the ND-range for the NMS stage: one work item per bounding box,
// work group sized to one image's boxes so the in-group sort can share one
// local table.
CommonDispatchData DetectionOutputKernel::SetDefault(const detection_output_params& params) const {
CommonDispatchData dispatchData = DetectionOutputKernelBase::SetDefault(params);
// Number of all work items is set to total number of bounding boxes -
// one bounding box is processed by one work item
size_t num_classes = (params.detectOutParams.share_location) ? 1 : params.detectOutParams.num_classes;
// Size of input0 (input location), if shared location it is equal to size of one class,
// else it has size of all items for all classes
size_t bboxesNum = params.inputs[0].LogicalSize() / PRIOR_BOX_SIZE / num_classes;
// Work group size is set to number of bounding boxes per image for sorting purpose
// (access to one table with sorted values)
size_t work_group_size = bboxesNum / params.inputs[0].Batch().v;
// Cap the work group at 256 by splitting into roughly equal chunks.
// NOTE(review): the trailing "+ 1" can push the result above an even split;
// looks intentional (round-up), but confirm against the kernel's NUM_OF_ITEMS.
if (work_group_size > 256) {
work_group_size = work_group_size / ((work_group_size / 256) + 1) + 1;
}
bboxesNum = work_group_size * params.inputs[0].Batch().v;
dispatchData.gws[0] = Align(bboxesNum, work_group_size);
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
dispatchData.lws[0] = work_group_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;
return dispatchData;
}
// Builds the kernel data for the NMS stage: default dispatch, JIT constants,
// and the two extra inputs (confidence, prior boxes) on top of locations.
KernelsData DetectionOutputKernel::GetKernelsData(const Params& params, const optional_params& options) const {
    assert(params.GetType() == KernelType::DETECTION_OUTPUT && options.GetType() == KernelType::DETECTION_OUTPUT);

    KernelData kernel_data = KernelData::Default<detection_output_params>(params);
    const detection_output_params& do_params = static_cast<const detection_output_params&>(params);

    DispatchData dispatch_data = SetDefault(do_params);
    auto jit_constants = GetJitConstants(do_params);
    auto entry_point = GetEntryPoint(kernelName, do_params.layerID, options);
    auto jit_source = CreateJit(kernelName, jit_constants, entry_point);

    auto& cl_kernel = kernel_data.kernels[0];
    FillCLKernelData(cl_kernel, dispatch_data, params.engineInfo, kernelName, jit_source, entry_point);
    // Besides the default location input, the kernel also reads the
    // confidence tensor (arg 1) and the prior boxes (arg 2).
    cl_kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
    cl_kernel.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2});

    kernel_data.estimatedTime = FORCE_PRIORITY_8;
    return {kernel_data};
}
} // namespace kernel_selector

View File

@ -1,33 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "detection_output_kernel_base.h"
namespace kernel_selector {
// Reference OpenCL implementation of the DetectionOutput NMS stage
// (entry point "detection_output" in detection_output.cl).
class DetectionOutputKernel : public DetectionOutputKernelBase {
public:
DetectionOutputKernel() : DetectionOutputKernelBase("detection_output") {}
virtual ~DetectionOutputKernel() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
private:
// Sizes the ND-range: one work item per bounding box, grouped per image.
CommonDispatchData SetDefault(const detection_output_params& params) const override;
};
} // namespace kernel_selector

View File

@ -1,34 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_selector.h"
#include "detection_output_kernel_ref.h"
#include "detection_output_kernel_sort.h"
namespace kernel_selector {
// Registers the available NMS-stage implementations.
detection_output_kernel_selector::detection_output_kernel_selector() { Attach<DetectionOutputKernel>(); }
// Picks the best NMS-stage kernel for the given params (naive strategy:
// first attached kernel whose ParamsKey supports the request).
KernelsData detection_output_kernel_selector::GetBestKernels(const Params& params,
const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::DETECTION_OUTPUT);
}
// Registers the available sort-stage implementations.
detection_output_sort_kernel_selector::detection_output_sort_kernel_selector() { Attach<DetectionOutputKernel_sort>(); }
// Picks the best sort-stage kernel; note it matches on the same
// KernelType::DETECTION_OUTPUT tag as the NMS stage.
KernelsData detection_output_sort_kernel_selector::GetBestKernels(const Params& params,
const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::DETECTION_OUTPUT);
}
} // namespace kernel_selector

View File

@ -1,48 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "kernel_selector.h"
namespace kernel_selector {
// Singleton selector for the DetectionOutput NMS stage.
class detection_output_kernel_selector : public kernel_selector_base {
public:
// Meyers-singleton accessor; thread-safe initialization since C++11.
static detection_output_kernel_selector& Instance() {
static detection_output_kernel_selector instance_;
return instance_;
}
detection_output_kernel_selector();
virtual ~detection_output_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
// Singleton selector for the DetectionOutput sort stage.
class detection_output_sort_kernel_selector : public kernel_selector_base {
public:
// Meyers-singleton accessor; thread-safe initialization since C++11.
static detection_output_sort_kernel_selector& Instance() {
static detection_output_sort_kernel_selector instance_;
return instance_;
}
detection_output_sort_kernel_selector();
virtual ~detection_output_sort_kernel_selector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
} // namespace kernel_selector

View File

@ -1,82 +0,0 @@
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "detection_output_kernel_sort.h"
#include "kernel_selector_utils.h"
#define DETECTION_OUTPUT_ROW_SIZE 7 // Each detection consists of [image_id, label, confidence, xmin, ymin, xmax, ymax].
namespace kernel_selector {
ParamsKey DetectionOutputKernel_sort::GetSupportedKey() const {
    ParamsKey key;
    // Same support matrix as the NMS stage: fp16/fp32, bfyx only.
    key.EnableInputDataType(Datatype::F16);
    key.EnableInputDataType(Datatype::F32);
    key.EnableOutputDataType(Datatype::F16);
    key.EnableOutputDataType(Datatype::F32);
    key.EnableInputLayout(DataLayout::bfyx);
    key.EnableOutputLayout(DataLayout::bfyx);
    // Padded/strided tensors and batched execution are supported.
    key.EnableTensorOffset();
    key.EnableTensorPitches();
    key.EnableBatching();
    return key;
}
// Computes the ND-range for the sort stage: work items span classes x images,
// one work group per image's class set.
CommonDispatchData DetectionOutputKernel_sort::SetDefault(const detection_output_params& params) const {
CommonDispatchData dispatchData = DetectionOutputKernelBase::SetDefault(params);
unsigned class_num = params.detectOutParams.num_classes;
// With shared locations and background mapped to class 0, the background
// class carries no output and is dropped from the work size.
if (params.detectOutParams.share_location && params.detectOutParams.background_label_id == 0) {
class_num -= 1;
}
const size_t bboxesNum = class_num * params.detectOutParams.num_images;
// Work group size is set to number of bounding boxes per image
size_t work_group_size = class_num;
// Cap the group at 256; the "+ work_group_size % 2" keeps the numerator even
// before dividing. NOTE(review): verify this matches the kernel's
// NUM_OF_ITEMS_SORT derivation.
if (work_group_size > 256) {
work_group_size = (work_group_size + work_group_size % 2) / (work_group_size / 256 + 1);
}
dispatchData.gws[0] = Align(bboxesNum, work_group_size);
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
dispatchData.lws[0] = work_group_size;
dispatchData.lws[1] = 1;
dispatchData.lws[2] = 1;
return dispatchData;
}
// Builds the kernel data for the sort stage; unlike the NMS stage it takes
// only the single default input (the NMS output) plus the output buffer.
KernelsData DetectionOutputKernel_sort::GetKernelsData(const Params& params, const optional_params& options) const {
    assert(params.GetType() == KernelType::DETECTION_OUTPUT &&
           options.GetType() == KernelType::DETECTION_OUTPUT);

    KernelData kernel_data = KernelData::Default<detection_output_params>(params);
    const detection_output_params& do_params = static_cast<const detection_output_params&>(params);

    DispatchData dispatch_data = SetDefault(do_params);
    auto jit_constants = GetJitConstants(do_params);
    auto entry_point = GetEntryPoint(kernelName, do_params.layerID, options);
    auto jit_source = CreateJit(kernelName, jit_constants, entry_point);

    auto& cl_kernel = kernel_data.kernels[0];
    FillCLKernelData(cl_kernel, dispatch_data, params.engineInfo, kernelName, jit_source, entry_point);

    kernel_data.estimatedTime = FORCE_PRIORITY_8;
    return {kernel_data};
}
} // namespace kernel_selector

View File

@ -1,33 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "detection_output_kernel_base.h"
namespace kernel_selector {
class DetectionOutputKernel_sort : public DetectionOutputKernelBase {
public:
DetectionOutputKernel_sort() : DetectionOutputKernelBase("detection_output_sort") {}
virtual ~DetectionOutputKernel_sort() {}
KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
ParamsKey GetSupportedKey() const override;
private:
CommonDispatchData SetDefault(const detection_output_params& params) const override;
};
} // namespace kernel_selector

View File

@ -1,217 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#include "include/detection_output_common.cl"
// NMS stage of DetectionOutput. Per (image, class) pair the work group:
//   1. bubble-sorts prior-box indices by confidence in local memory,
//   2. runs greedy NMS with an adaptive IoU threshold,
//   3. writes survivors as [image_id, label, score, xmin, ymin, xmax, ymax]
//      rows, padding the remainder with image_id = -1 rows.
// NOTE(review): NUM_OF_ITEMS, NUM_OF_ITERATIONS, NUM_OF_PRIORS, SCORES_COUNT,
// NUM_CLASSES_OUT, HIDDEN_CLASS, OUTPUT_* come from
// include/detection_output_common.cl (not visible here); confirm there.
KERNEL (detection_output)(__global UNIT_TYPE* input_location, __global UNIT_TYPE* output, __global UNIT_TYPE* input_confidence, __global UNIT_TYPE* input_prior_box)
{
const uint idx = get_global_id(0); // bbox idx
const uint local_id = (uint)get_local_id(0) * NUM_OF_ITEMS; // All bboxes from one image in work group
const uint idx_image = idx / NUM_OF_ITERATIONS; // idx of current image
// Local scratch: sort table of bbox indices and per-class survivor counts.
__local uint indexes[NUM_OF_PRIORS];
__local uint scores_size[NUM_CLASSES * NUM_OF_IMAGES];
__local bool stillSorting;
uint indexes_class_0[NUM_OF_PRIORS];
// Find whether any of this item's NUM_OF_ITEMS bboxes is the last of a class;
// that item later performs the single-threaded per-class NMS.
int last_bbox_in_class = NUM_OF_ITEMS;
bool is_last_bbox_in_class = false;
for (uint it = 0; it < NUM_OF_ITEMS; it ++)
{
if (((local_id + it + 1) % NUM_OF_PRIORS) == 0 )
{
last_bbox_in_class = it;
is_last_bbox_in_class = true;
break;
}
}
for (uint idx_class = 0; idx_class < NUM_CLASSES; idx_class++)
{
if (idx_class == BACKGROUND_LABEL_ID)
{
continue;
}
// Reset the sort table to the identity permutation.
for (uint it = 0; it < NUM_OF_ITEMS; it++)
{
indexes[local_id + it] = local_id + it;
}
stillSorting = true;
barrier(CLK_LOCAL_MEM_FENCE);
bool is_last_bbox_in_image = (is_last_bbox_in_class) && (idx_class == (NUM_CLASSES - 1));
// Cooperative odd-even transposition sort by descending confidence.
while(stillSorting)
{
barrier(CLK_LOCAL_MEM_FENCE);
stillSorting = false;
for (uint i = 0; i < 2; i++)
{
for (uint it = 0; it < NUM_OF_ITEMS; it++)
{
uint item_id = local_id + it;
uint idx1 = indexes[item_id];
uint idx2 = indexes[item_id+1];
bool perform = false;
// Alternate odd/even pairs; never compare across class boundary.
if ((((i % 2) && (item_id % 2)) ||
((!(i % 2)) && (!(item_id % 2)))) &&
(it < last_bbox_in_class))
{
perform = true;
}
if (perform &&
(FUNC_CALL(get_score)(input_confidence, idx1, idx_class, idx_image) <
FUNC_CALL(get_score)(input_confidence, idx2, idx_class, idx_image)))
{
indexes[item_id] = idx2;
indexes[item_id+1] = idx1;
stillSorting = true;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
// Do it only once per class in image
if (is_last_bbox_in_class)
{
UNIT_TYPE adaptive_threshold = NMS_THRESHOLD;
uint post_nms_count = 0;
const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
scores_size[idx_class] = 0;
// Do the "keep" algorithm only for classes with confidence greater than CONFIDENCE_THRESHOLD.
// Check first, the biggest one (after sort) element in class.
if (FUNC_CALL(get_score)(input_confidence, indexes[0], idx_class, idx_image) != 0.0f)
{
// Greedy NMS: keep a candidate if its IoU with every already-kept
// box stays at or below the (decaying) adaptive threshold.
for (uint i = 0; i < SCORES_COUNT; i++)
{
const uint bb_idx = indexes[i];
bool keep = true;
for (uint j = 0; j < post_nms_count; j++)
{
if (!keep)
{
break;
}
UNIT_TYPE overlap = 0.0;
const uint bb_idx2 = indexes[j];
UNIT_TYPE decoded_bbox1[4];
FUNC_CALL(get_decoded_bbox)(decoded_bbox1, input_location, input_prior_box, bb_idx, shared_class, idx_image);
UNIT_TYPE decoded_bbox2[4];
FUNC_CALL(get_decoded_bbox)(decoded_bbox2, input_location, input_prior_box, bb_idx2, shared_class, idx_image);
bool intersecting =
(decoded_bbox1[0] < decoded_bbox2[2]) &
(decoded_bbox2[0] < decoded_bbox1[2]) &
(decoded_bbox1[1] < decoded_bbox2[3]) &
(decoded_bbox2[1] < decoded_bbox1[3]);
if (intersecting)
{
// Intersection-over-union of the two decoded boxes.
const UNIT_TYPE intersect_width = min(decoded_bbox1[2], decoded_bbox2[2]) - max(decoded_bbox1[0], decoded_bbox2[0]);
const UNIT_TYPE intersect_height = min(decoded_bbox1[3], decoded_bbox2[3]) - max(decoded_bbox1[1], decoded_bbox2[1]);
const UNIT_TYPE intersect_size = intersect_width * intersect_height;
const UNIT_TYPE bbox1_area = (decoded_bbox1[2] - decoded_bbox1[0]) * (decoded_bbox1[3] - decoded_bbox1[1]);
const UNIT_TYPE bbox2_area = (decoded_bbox2[2] - decoded_bbox2[0]) * (decoded_bbox2[3] - decoded_bbox2[1]);
overlap = intersect_size / (bbox1_area + bbox2_area - intersect_size);
}
keep = (overlap <= adaptive_threshold);
}
if (keep)
{
// Compact kept indices to the front of the sort table.
indexes[post_nms_count] = indexes[i];
++post_nms_count;
}
if ((keep) && (ETA < 1) && (adaptive_threshold > 0.5))
{
adaptive_threshold *= ETA;
}
}
}
// Write number of scores to global memory, for proper output order in separated work groups
scores_size[idx_class] = post_nms_count;
}
stillSorting = true;
// Wait for scores number from all classes in images
barrier(CLK_LOCAL_MEM_FENCE);
// Every work item writes its share of kept detections for this class.
uint output_offset = (idx_image * NUM_CLASSES_OUT + idx_class - HIDDEN_CLASS) * SCORES_COUNT;
for (uint it = 0; it < NUM_OF_ITEMS; it++)
{
const uint local_id_out = local_id + it;
if (local_id_out < scores_size[idx_class])
{
const uint score_idx = indexes[local_id_out];
uint bb_idx = indexes[local_id_out];
const uint shared_class = (SHARE_LOCATION)? 0 : idx_class;
UNIT_TYPE decoded_bbox[4];
FUNC_CALL(get_decoded_bbox)(decoded_bbox, input_location, input_prior_box, bb_idx, shared_class, idx_image);
const uint out_idx = (local_id_out + output_offset) * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
output[out_idx] = TO_UNIT_TYPE(idx_image);
output[out_idx + 1] = TO_UNIT_TYPE(idx_class);
output[out_idx + 2] = FUNC_CALL(get_score)(input_confidence, score_idx, idx_class, idx_image);
output[out_idx + 3] = decoded_bbox[0];
output[out_idx + 4] = decoded_bbox[1];
output[out_idx + 5] = decoded_bbox[2];
output[out_idx + 6] = decoded_bbox[3];
}
}
// If work item is processing last bbox in image (we already know the number of all detections),
// use it to fill rest of keep_top_k items if number of detections is smaller
if (is_last_bbox_in_class)
{
uint out_idx = output_offset + scores_size[idx_class];
uint current_top_k = output_offset + SCORES_COUNT;
for (uint i = out_idx; i < current_top_k; i++)
{
// Dummy row: image_id = -1 marks an unused slot.
out_idx = i * OUTPUT_ROW_SIZE + OUTPUT_OFFSET;
output[out_idx] = -1.0;
output[out_idx + 1] = 0.0;
output[out_idx + 2] = 0.0;
output[out_idx + 3] = 0.0;
output[out_idx + 4] = 0.0;
output[out_idx + 5] = 0.0;
output[out_idx + 6] = 0.0;
}
}
// Write number of scores kept in first step of detection output
if (is_last_bbox_in_image)
{
uint scores_sum = 0;
for (uint i = 0; i < NUM_CLASSES; i++)
{
scores_sum += scores_size[i];
}
output[idx_image] = scores_sum;
}
}
}

View File

@ -1,217 +0,0 @@
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/include_all.cl"
#include "include/detection_output_common.cl"
// Returns the confidence score of row idx_bbox for image idx_image from the
// NMS-stage output. The sentinel index KEEP_BBOXES_NUM marks a class whose
// boxes are exhausted; returning -1 sinks it below every real score in the sort.
UNIT_TYPE FUNC(get_score_sort)(__global UNIT_TYPE* input_bboxes, const uint idx_bbox, const uint idx_image)
{
if (idx_bbox == KEEP_BBOXES_NUM)
{
// Idx set to dummy value, return -1 to exclude this element from sorting
return -1;
}
else
{
return input_bboxes[(idx_bbox + idx_image * NUM_OF_IMAGE_BBOXES) * OUTPUT_ROW_SIZE + INPUT_OFFSET + SCORE_OFFSET];
}
}
// Second ("keep top-k") stage of detection output.
// Input layout (as used below): the first NUM_IMAGES elements of input_bboxes hold the
// per-image detection counts (see the reads at image_id / i indices); detection rows of
// OUTPUT_ROW_SIZE elements follow starting at INPUT_OFFSET, grouped per image and per class
// (NUM_OF_CLASS_BBOXES rows per class). Each row is
// [image_id, label, confidence, xmin, ymin, xmax, ymax].
// One work-group processes one image; scores are compared via FUNC_CALL(get_score_sort),
// which returns -1 for exhausted/dummy entries so they sink during sorting.
KERNEL (detection_output_sort)(__global UNIT_TYPE* input_bboxes, __global UNIT_TYPE* output)
{
// indexes[c] is the index of the current "head" (best unconsumed) bbox of class c.
__local uint indexes[NUM_CLASSES_IN];
__local bool stillSorting;
__local uint output_count;
// Number of rows already emitted per class (used to regroup results by class at the end).
__local uint num_out_per_class[NUM_CLASSES_IN];
// NOTE(review): these __local stores are performed by every work-item without a barrier
// before first use; presumably benign because all work-items write the same values /
// distinct slots — confirm against the dispatch configuration.
output_count = 0;
num_out_per_class[get_local_id(0)] = 0;
const uint image_id = (uint)get_global_id(0) / NUM_CLASSES_IN;
const uint local_id = (uint)get_local_id(0) * NUM_OF_ITEMS_SORT; // All bboxes from one image in work group
uint image_offset_input = image_id * NUM_OF_IMAGE_BBOXES;
// Prefix-sum the kept counts of all preceding images (each clamped to KEEP_TOP_K)
// to find where this image's rows start in the packed output buffer.
uint count_sum = 0;
for (uint i = 0; i < image_id; i++)
{
count_sum += (input_bboxes[i] < KEEP_TOP_K)? input_bboxes[i] : KEEP_TOP_K;
}
uint image_offset_output = count_sum * OUTPUT_ROW_SIZE;
// If there is less elements than needed, write input to output
if (input_bboxes[image_id] <= KEEP_TOP_K)
{
// Fast path: no sorting needed; work-item 0 copies all valid rows class by class.
if (local_id == 0)
{
for (uint class = 0; class < NUM_CLASSES_IN; class++)
{
// Skip the background class unless it was already hidden (removed) upstream.
if (class == BACKGROUND_LABEL_ID && !HIDDEN_CLASS)
{
continue;
}
for (uint i = 0; i < NUM_OF_CLASS_BBOXES; i++)
{
uint input_idx = (i + image_offset_input + class * NUM_OF_CLASS_BBOXES) * OUTPUT_ROW_SIZE + INPUT_OFFSET;
// image_id == -1 marks the end of valid rows within a class.
if (input_bboxes[input_idx] != -1)
{
uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output;
for (uint idx = 0; idx < OUTPUT_ROW_SIZE; idx++)
{
output[out_idx + idx] = input_bboxes[input_idx + idx];
}
output_count++;
}
else
{
break;
}
}
}
}
}
else
{
// Slow path: select the KEEP_BBOXES_NUM best rows across classes.
// sorted_output is private; only work-item 0 ever reads/writes it (inside local_id == 0).
uint sorted_output[KEEP_TOP_K * NUM_CLASSES_IN];
// Seed each class head index with the first bbox of that class.
for (uint it = 0; it < NUM_OF_ITEMS_SORT; it++)
{
indexes[local_id + it] = (local_id + it) * NUM_OF_CLASS_BBOXES;
}
while (output_count < KEEP_BBOXES_NUM)
{
stillSorting = true;
// Parallel odd-even transposition sort of the class head indexes by score
// (descending); repeats until a full pass makes no swaps.
while(stillSorting)
{
barrier(CLK_LOCAL_MEM_FENCE);
stillSorting = false;
for (uint it = 0; it < NUM_OF_ITEMS_SORT; it++)
{
uint item_id = local_id + it;
// i == 0: even-phase pairs; i == 1: odd-phase pairs.
for (uint i = 0; i < 2; i++)
{
uint idx1 = indexes[item_id];
uint idx2 = indexes[item_id+1];
bool perform = false;
if ((((i % 2) && (item_id % 2)) ||
((!(i % 2)) && (!(item_id % 2)))) &&
(item_id != (NUM_CLASSES_IN - 1)))
{
perform = true;
}
if (perform &&
(FUNC_CALL(get_score_sort)(input_bboxes, idx1, image_id) <
FUNC_CALL(get_score_sort)(input_bboxes, idx2, image_id)))
{
indexes[item_id] = idx2;
indexes[item_id+1] = idx1;
stillSorting = true;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
// Work-item 0 consumes every head that ties with the current best score.
if (local_id == 0)
{
UNIT_TYPE top_score = FUNC_CALL(get_score_sort)(input_bboxes, indexes[0], image_id);
if (top_score != 0)
{
for (uint it = 0; (it < NUM_CLASSES_IN) && (output_count < KEEP_BBOXES_NUM); it++)
{
if (FUNC_CALL(get_score_sort)(input_bboxes, indexes[it], image_id) == top_score)
{
// write to output, create counter, and check if keep_top_k is satisfied.
uint input_idx = (indexes[it] + image_offset_input) * OUTPUT_ROW_SIZE + INPUT_OFFSET;
// Row field [1] is the class label; HIDDEN_CLASS shifts labels when the
// background class was removed upstream.
uint class_idx = input_bboxes[input_idx + 1] - HIDDEN_CLASS;
sorted_output[class_idx * KEEP_TOP_K + num_out_per_class[class_idx]] = input_idx;
num_out_per_class[class_idx]++;
indexes[it]++;
output_count++;
// If all class elements are written to output, set dummy value to exclude class from sorting.
if ((indexes[it] % NUM_OF_CLASS_BBOXES) == 0)
{
indexes[it] = KEEP_BBOXES_NUM;
}
}
}
}
else
{
// There is no more significant results to sort.
output_count = KEEP_BBOXES_NUM;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// Emit the selected rows grouped by class, then pad this image's slot with
// dummy rows (image_id = -1) up to its clamped count.
if (local_id == 0)
{
output_count = 0;
for (uint i = 0; i < NUM_CLASSES_IN; i++)
{
for (uint j = 0; j < num_out_per_class[i]; j++)
{
uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output;
for (uint idx = 0; idx < OUTPUT_ROW_SIZE; idx++)
{
output[out_idx + idx] = input_bboxes[sorted_output[i * KEEP_TOP_K + j] + idx];
}
output_count++;
}
}
uint image_count_sum = (input_bboxes[image_id] < KEEP_TOP_K)? input_bboxes[image_id] : KEEP_TOP_K;
// First clause is a deliberate no-op: continue from the current output_count.
for (output_count; output_count < image_count_sum; output_count++)
{
uint out_idx = output_count * OUTPUT_ROW_SIZE + image_offset_output;
output[out_idx] = -1.0;
output[out_idx + 1] = 0.0;
output[out_idx + 2] = 0.0;
output[out_idx + 3] = 0.0;
output[out_idx + 4] = 0.0;
output[out_idx + 5] = 0.0;
output[out_idx + 6] = 0.0;
}
}
}
// The work-group of the last image pads the global tail of the buffer with dummy
// rows so the output always holds KEEP_TOP_K * NUM_IMAGES rows.
if (local_id == 0 &&
image_id == (NUM_IMAGES - 1))
{
for (output_count += count_sum; output_count < (KEEP_TOP_K * NUM_IMAGES); output_count++ )
{
uint out_idx = output_count * OUTPUT_ROW_SIZE;
output[out_idx] = -1.0;
output[out_idx + 1] = 0.0;
output[out_idx + 2] = 0.0;
output[out_idx + 3] = 0.0;
output[out_idx + 4] = 0.0;
output[out_idx + 5] = 0.0;
output[out_idx + 6] = 0.0;
}
}
}

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -65,16 +65,8 @@ layout detection_output_inst::calc_output_layout(detection_output_node const& no
// Add space for number of output results per image - needed in the next detection output step
output_size += ((input_layout.size.batch[0] + 15) / 16) * 16;
if (node.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled()) {
return {input_layout.data_type, cldnn::format::bfyx, cldnn::tensor(1, 1, 1, output_size)};
} else {
return {input_layout.data_type,
cldnn::format::bfyx,
cldnn::tensor(1,
1,
DETECTION_OUTPUT_ROW_SIZE,
node.get_primitive()->keep_top_k * input_layout.size.batch[0])};
}
return {input_layout.data_type, cldnn::format::bfyx,
cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, node.get_primitive()->keep_top_k * input_layout.size.batch[0])};
}
std::string detection_output_inst::to_string(detection_output_node const& node) {
@ -203,82 +195,4 @@ detection_output_inst::typed_primitive_inst(network_impl& network, detection_out
"Detection output layer doesn't support input padding in Prior-Box input");
}
/************************ Detection Output keep_top_k part ************************/
// Returns the singleton type descriptor for the detection_output_sort primitive.
// The function-local static ("Meyers singleton") guarantees exactly one instance
// and thread-safe initialization on first use.
primitive_type_id detection_output_sort::type_id() {
static primitive_type_base<detection_output_sort> instance;
return &instance;
}
// Computes the output layout of a detection_output_sort node:
// a bfyx tensor of keep_top_k * num_images rows, each DETECTION_OUTPUT_ROW_SIZE wide
// ([image_id, label, confidence, xmin, ymin, xmax, ymax]).
// Requires exactly one input and no forced output data type.
layout detection_output_sort_inst::calc_output_layout(detection_output_sort_node const& node) {
assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
"Output data type forcing is not supported for "
"detection_output_sort_node!");
CLDNN_ERROR_NOT_EQUAL(node.id(),
"Detection output layer input number",
node.get_dependencies().size(),
"expected number of inputs",
static_cast<size_t>(1),
"");
auto input_layout = node.input().get_output_layout();
int keep_top_k = node.as<detection_output_sort>().get_primitive()->keep_top_k;
int num_images = node.as<detection_output_sort>().get_primitive()->num_images;
// If detection output sort is used as a second part of detection output get proper info from detection output node
// (num_images == 0 is the sentinel meaning "parameters were not set explicitly").
if (num_images == 0) {
CLDNN_ERROR_BOOL(node.id(),
"node.get_dependency(0).is_type<detection_output>()",
!node.get_dependency(0).is_type<detection_output>(),
"Cannot calculate output layout.");
input_layout = node.get_dependency(0).as<detection_output>().location().get_output_layout();
keep_top_k = node.get_dependency(0).as<detection_output>().get_primitive()->keep_top_k;
num_images = input_layout.size.batch[0];
}
// Batch size and feature size are 1.
// Number of bounding boxes to be kept is set to keep_top_k*batch size.
// If number of detections is lower than keep_top_k, will write dummy results at the end with image_id=-1.
// Each row is a 7 dimension vector, which stores:
// [image_id, label, confidence, xmin, ymin, xmax, ymax]
return {input_layout.data_type,
cldnn::format::bfyx,
cldnn::tensor(1, 1, DETECTION_OUTPUT_ROW_SIZE, keep_top_k * num_images)};
}
// Renders a human-readable JSON description of a detection_output_sort node
// (used for debug dumps / logging).
// Fixes: the first field was labelled "num_classes:" while printing desc->num_images,
// producing a duplicate key and mislabelled value; the stray ':' suffixes and the
// "dection" typo in the section name are corrected as well.
std::string detection_output_sort_inst::to_string(detection_output_sort_node const& node) {
auto node_info = node.desc_to_json();
auto desc = node.get_primitive();
auto& input_bboxes = node.input();
std::stringstream primitive_description;
json_composite detec_out_info;
detec_out_info.add("input bboxes id", input_bboxes.id());
detec_out_info.add("num_images", desc->num_images);
detec_out_info.add("num_classes", desc->num_classes);
detec_out_info.add("keep_top_k", desc->keep_top_k);
detec_out_info.add("share_location", desc->share_location);
detec_out_info.add("top_k", desc->top_k);
detec_out_info.dump(primitive_description);
node_info->add("detection output info", detec_out_info);
node_info->dump(primitive_description);
return primitive_description.str();
}
// Instance constructor: validates that the input is in bfyx format and that the
// node carries no output padding (neither is supported by this primitive).
// NOTE(review): "Detecion" in the error string below is a typo, but it is a runtime
// message — left unchanged here.
detection_output_sort_inst::typed_primitive_inst(network_impl& network, detection_output_sort_node const& node)
: parent(network, node) {
CLDNN_ERROR_NOT_PROPER_FORMAT(node.id(),
"Input memory format",
node.get_dependency(0).get_output_layout().format.value,
"expected bfyx input format",
format::bfyx);
CLDNN_ERROR_BOOL(node.id(),
"Detecion output layer padding",
node.is_padded(),
"Detection output layer doesn't support output padding.");
}
} // namespace cldnn

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -19,6 +19,7 @@
#include "network_impl.h"
#include "implementation_map.h"
#include "math_utils.h"
#include "register_gpu.hpp"
#include "cpu_impl_helpers.hpp"
#include <algorithm>
@ -636,7 +637,14 @@ struct detection_output_cpu : typed_primitive_impl<detection_output> {
static primitive_impl* create(const detection_output_node& arg) { return new detection_output_cpu(arg); }
};
primitive_impl* runDetectOutCpu(const detection_output_node& arg) { return new detection_output_cpu(arg); }
namespace detail {
attach_detection_output_gpu::attach_detection_output_gpu() {
implementation_map<detection_output>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), detection_output_cpu::create);
implementation_map<detection_output>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), detection_output_cpu::create);
}
} // namespace detail
} // namespace gpu
} // namespace cldnn

View File

@ -1,164 +0,0 @@
/*
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
#include "detection_output_inst.h"
#include "primitive_gpu_base.h"
#include "error_handler.h"
#include "kernel_selector_helper.h"
#include "detection_output/detection_output_kernel_base.h"
#include "detection_output/detection_output_kernel_selector.h"
#ifdef FIX_OPENMP_RELEASE_ISSUE
#ifdef OPENMP_FOUND
#include <omp.h>
#endif
#endif
namespace cldnn {
namespace gpu {
// GPU implementation of the first detection_output stage. Falls back to the CPU
// implementation unless the detection_output_gpu build option is enabled.
struct detection_output_gpu : typed_primitive_gpu_impl<detection_output> {
using parent = typed_primitive_gpu_impl<detection_output>;
using parent::parent;

private:
// Copies all detection-output attributes from the cldnn primitive into the
// kernel-selector parameter struct, plus the confidence input's buffer size
// and lower padding (needed by the kernel to address padded inputs).
static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams,
const detection_output_node& arg) {
auto primitive = arg.get_primitive();
detectOutParams.keep_top_k = primitive->keep_top_k;
detectOutParams.num_classes = primitive->num_classes;
detectOutParams.top_k = primitive->top_k;
detectOutParams.background_label_id = primitive->background_label_id;
detectOutParams.code_type = (int32_t)primitive->code_type;
detectOutParams.share_location = primitive->share_location;
detectOutParams.variance_encoded_in_target = primitive->variance_encoded_in_target;
detectOutParams.nms_threshold = primitive->nms_threshold;
detectOutParams.eta = primitive->eta;
detectOutParams.confidence_threshold = primitive->confidence_threshold;
detectOutParams.prior_coordinates_offset = primitive->prior_coordinates_offset;
detectOutParams.prior_info_size = primitive->prior_info_size;
detectOutParams.prior_is_normalized = primitive->prior_is_normalized;
detectOutParams.input_width = primitive->input_width;
// "input_heigh" (sic) is the kernel-selector field's spelling; cannot rename here.
detectOutParams.input_heigh = primitive->input_height;
detectOutParams.conf_size_x = arg.confidence().get_output_layout().get_buffer_size().spatial[0];
detectOutParams.conf_size_y = arg.confidence().get_output_layout().get_buffer_size().spatial[1];
detectOutParams.conf_padding_x = arg.confidence().get_output_layout().data_padding.lower_size().spatial[0];
detectOutParams.conf_padding_y = arg.confidence().get_output_layout().data_padding.lower_size().spatial[1];
}

public:
// Factory: selects the best available GPU kernel for this node, or delegates to
// the CPU implementation when the GPU path is disabled in the build options.
// Returns a heap-allocated impl owned by the caller.
static primitive_impl* create(const detection_output_node& arg) {
if (!arg.get_program().get_options().get<build_option_type::detection_output_gpu>()->enabled()) {
return runDetectOutCpu(arg);
}

auto detect_out_params = get_default_params<kernel_selector::detection_output_params>(arg);
auto detect_out_optional_params =
get_default_optional_params<kernel_selector::detection_output_optional_params>(arg.get_program());

setDetectOutSpecificParams(detect_out_params.detectOutParams, arg);

auto& kernel_selector = kernel_selector::detection_output_kernel_selector::Instance();
auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params);

CLDNN_ERROR_BOOL(arg.id(),
"Best_kernel.empty()",
best_kernels.empty(),
"Cannot find a proper kernel with this arguments");

auto detect_out = new detection_output_gpu(arg, best_kernels[0]);

return detect_out;
}
};
// Factory wrapper: builds a GPU detection_output implementation around a
// kernel that was already selected by the caller. The caller owns the result.
primitive_impl* runDetectOutGpu(const detection_output_node& arg, kernel_selector::KernelData kernel) {
    auto* impl = new detection_output_gpu(arg, kernel);
    return impl;
}
/************************ Detection Output keep_top_k part ************************/
// GPU implementation of the second ("keep top-k" / sort) detection_output stage.
struct detection_output_sort_gpu : typed_primitive_gpu_impl<detection_output_sort> {
    using parent = typed_primitive_gpu_impl<detection_output_sort>;
    using parent::parent;

private:
    // Fills the kernel-selector params. When this node was auto-inserted after a
    // detection_output node, the real parameters live on that producer, so they are
    // taken from it (and num_images from its location input's batch size); otherwise
    // the node's own primitive carries them.
    // NOTE(review): the two branches assign the same fields but read from primitives of
    // *different* types (detection_output vs detection_output_sort), so the duplication
    // cannot be hoisted without a template helper.
    static void setDetectOutSpecificParams(kernel_selector::detection_output_params::DedicatedParams& detectOutParams,
                                           const detection_output_sort_node& arg) {
        if (arg.get_dependency(0).is_type<detection_output>()) {
            auto primitive = arg.get_dependency(0).as<detection_output>().get_primitive();
            detectOutParams.keep_top_k = primitive->keep_top_k;
            detectOutParams.num_classes = primitive->num_classes;
            detectOutParams.num_images =
                arg.get_dependency(0).as<detection_output>().location().get_output_layout().size.batch[0];
            detectOutParams.top_k = primitive->top_k;
            detectOutParams.share_location = primitive->share_location;
            detectOutParams.background_label_id = primitive->background_label_id;
        } else {
            auto primitive = arg.get_primitive();
            detectOutParams.keep_top_k = primitive->keep_top_k;
            detectOutParams.num_classes = primitive->num_classes;
            detectOutParams.num_images = primitive->num_images;
            detectOutParams.top_k = primitive->top_k;
            detectOutParams.share_location = primitive->share_location;
            detectOutParams.background_label_id = primitive->background_label_id;
        }
    }

public:
    // Factory: selects the best available GPU sort kernel for this node.
    // Returns a heap-allocated impl owned by the caller.
    static primitive_impl* create(const detection_output_sort_node& arg) {
        auto detect_out_params = get_default_params<kernel_selector::detection_output_params>(arg);
        auto detect_out_optional_params =
            get_default_optional_params<kernel_selector::detection_output_optional_params>(arg.get_program());

        setDetectOutSpecificParams(detect_out_params.detectOutParams, arg);

        auto& kernel_selector = kernel_selector::detection_output_sort_kernel_selector::Instance();
        auto best_kernels = kernel_selector.GetBestKernels(detect_out_params, detect_out_optional_params);

        CLDNN_ERROR_BOOL(arg.id(),
                         "Best_kernel.empty()",
                         best_kernels.empty(),
                         "Cannot find a proper kernel with this arguments");

        auto detect_out = new detection_output_sort_gpu(arg, best_kernels[0]);

        return detect_out;
    }
};
// Factory wrapper: builds a GPU detection_output_sort implementation around a
// kernel that was already selected by the caller. The caller owns the result.
primitive_impl* runDetectOutSortGpu(const detection_output_sort_node& arg, kernel_selector::KernelData kernel) {
    auto* impl = new detection_output_sort_gpu(arg, kernel);
    return impl;
}
namespace detail {
// Registers the GPU implementations of detection_output and detection_output_sort
// for the fp32/fp16 bfyx layouts supported by the OCL engine.
attach_detection_output_gpu::attach_detection_output_gpu() {
    const auto key_f32 = std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx);
    const auto key_f16 = std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx);
    implementation_map<detection_output>::add(key_f32, detection_output_gpu::create);
    implementation_map<detection_output>::add(key_f16, detection_output_gpu::create);
    implementation_map<detection_output_sort>::add(key_f32, detection_output_sort_gpu::create);
    implementation_map<detection_output_sort>::add(key_f16, detection_output_sort_gpu::create);
}
} // namespace detail
} // namespace gpu
} // namespace cldnn

View File

@ -120,56 +120,6 @@ void graph_initializations::replace_nodes(program_impl& p) {
}
}
// When the detection_output_gpu build option is enabled, splits every
// detection_output node into a two-stage pipeline: the original node is renamed
// to "<id>_pre" and a detection_output_sort node takes over the original id
// (so downstream users keep their references), wired in between the node and
// its first user (or appended when there are no users).
// Iterates with iterators because nodes_map is mutated during the walk.
void graph_initializations::handle_detection_output(program_impl& p) {
auto itr = p.nodes_map.begin(); // note we need to use iterators since currently processed element can be removed
while (itr != p.nodes_map.end()) {
auto node_itr = itr++;
auto& node = *(*node_itr).second;
// Create second part detection output primitive and replace nodes names - do it only once
if ((p.get_options().get<build_option_type::detection_output_gpu>()->enabled()) &&
(node.is_type<detection_output>()) &&
(node.id().find("_pre") ==
std::string::npos)) { // ToDo: this will fail if user will name the primitive with using _pre like do_pre
// we need to use node mark() or some other idea to prevent it
// rename detection output
const primitive_id detect_out_node_name = node.id();
const primitive_id new_primitive_id = detect_out_node_name + "_pre";
p.rename(node, new_primitive_id);

auto detect_out_prim = node.as<detection_output>().typed_desc();
// Create new primitive, "keep top k" part of detection output
// ToDo: add a default parameters to the detection_output_sort class constructor to get rid off this
// initialization from here
auto detect_out_sort_prim =
std::make_shared<detection_output_sort>(detect_out_node_name,
node.id(),
// not important params here - it will be set during
// "primitive_impl* create" func in "detection_output_sort_gpu"
0,      // num_images
0,      // num_classes
0,      // keep_top_k
false,  // share_location
0,      // top_k
-1,     // background_label_id
detect_out_prim->output_padding);

p.get_or_create(detect_out_sort_prim);
auto sort_node_itr = p.nodes_map.find(detect_out_node_name);
// Defensive: skip if the sort node was not actually created.
if (sort_node_itr == p.nodes_map.end()) continue;
auto sort_node = sort_node_itr->second;

// Add connection to second part of detection output
if (node.get_users().size()) {
p.add_intermediate(*sort_node, *(node.get_users().front()), 0, false);
} else {
p.add_connection(node, *sort_node);
}
}
}
}
void graph_initializations::handle_lstm(program_impl& p) {
bool has_lstm_children;
auto itr = p.nodes_map.begin(); // note we need to use iterators since currently processed element can be removed
@ -528,7 +478,6 @@ void graph_initializations::set_outputs(program_impl& p) {
void graph_initializations::run(program_impl& p) {
replace_nodes(p);
handle_detection_output(p);
handle_lstm(p);
handle_dynamic_lstm(p);
set_outputs(p);

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -60,36 +60,4 @@ public:
using detection_output_inst = typed_primitive_inst<detection_output>;
template <>
class typed_program_node<detection_output_sort> : public typed_program_node_base<detection_output_sort> {
using parent = typed_program_node_base<detection_output_sort>;
public:
using parent::parent;
program_node& input() const { return get_dependency(0); }
};
using detection_output_sort_node = typed_program_node<detection_output_sort>;
template <>
class typed_primitive_inst<detection_output_sort> : public typed_primitive_inst_base<detection_output_sort> {
using parent = typed_primitive_inst_base<detection_output_sort>;
public:
static layout calc_output_layout(detection_output_sort_node const& node);
static std::string to_string(detection_output_sort_node const& node);
public:
typed_primitive_inst(network_impl& network, detection_output_sort_node const& node);
};
using detection_output_sort_inst = typed_primitive_inst<detection_output_sort>;
namespace gpu {
primitive_impl* runDetectOutCpu(const detection_output_node& arg);
primitive_impl* runDetectOutGpu(const detection_output_node& arg, kernel_selector::KernelData kernel);
primitive_impl* runDetectOutSortGpu(const detection_output_sort_node& arg, kernel_selector::KernelData kernel);
} // namespace gpu
} // namespace cldnn
} // namespace cldnn

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2018-2019 Intel Corporation
// Copyright (c) 2018-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -115,7 +115,6 @@ public:
private:
void run(program_impl& p) override;
void replace_nodes(program_impl& p);
void handle_detection_output(program_impl& p);
void handle_lstm(program_impl& p);
void handle_dynamic_lstm(program_impl& p);
void set_outputs(program_impl& p);

View File

@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -24,8 +24,7 @@
#include <api/engine.hpp>
#include "test_utils/test_utils.h"
namespace cldnn
{
namespace cldnn {
template<> struct type_to_data_type<FLOAT16> { static const data_types value = data_types::f16; };
}
@ -33,18 +32,15 @@ using namespace cldnn;
using namespace tests;
template <typename T>
class detection_output_test : public ::testing::Test
{
class detection_output_test : public ::testing::Test {
public:
detection_output_test() :
nms_threshold(0.1f)
{}
nms_threshold(0.1f) {}
void init_buffers(cldnn::memory prior_memory, cldnn::memory confidence_memory, cldnn::memory location_memory,
bool share_location, bool variance_encoded_in_target = false,
int prior_info_size = 4, int prior_coordinates_offset = 0, bool prior_is_normalized = true)
{
int prior_info_size = 4, int prior_coordinates_offset = 0, bool prior_is_normalized = true) {
auto location_ptr = location_memory.pointer<T>();
auto confidence_ptr = confidence_memory.pointer<T>();
auto prior_box_ptr = prior_memory.pointer<T>();
@ -59,11 +55,9 @@ public:
const float prior_multiplier = prior_is_normalized ? 1.0f : static_cast<float>(this->img_size);
const float variance = 0.1f;
int idx = 0;
for (int h = 0; h < 2; ++h)
{
for (int h = 0; h < 2; ++h) {
float center_y = (h + 0.5f) * step;
for (int w = 0; w < 2; ++w)
{
for (int w = 0; w < 2; ++w) {
float center_x = (w + 0.5f) * step;
prior_data[idx+prior_coordinates_offset+0] = (center_x - box_size / 2) * prior_multiplier;
prior_data[idx+prior_coordinates_offset+1] = (center_y - box_size / 2) * prior_multiplier;
@ -73,28 +67,20 @@ public:
idx += prior_info_size;
}
}
if (!variance_encoded_in_target)
{
for (int i = 0; i < idx; ++i)
{
if (!variance_encoded_in_target) {
for (int i = 0; i < idx; ++i) {
prior_data[idx + i] = variance;
}
}
// Fill confidences.
idx = 0;
for (int i = 0; i < num_of_images; ++i)
{
for (int j = 0; j < num_priors; ++j)
{
for (int c = 0; c < num_classes; ++c)
{
if (i % 2 == c % 2)
{
for (int i = 0; i < num_of_images; ++i) {
for (int j = 0; j < num_priors; ++j) {
for (int c = 0; c < num_classes; ++c) {
if (i % 2 == c % 2) {
confidence_data[idx++] = j * 0.2f;
}
else
{
} else {
confidence_data[idx++] = 1 - j * 0.2f;
}
}
@ -105,14 +91,10 @@ public:
const int num_loc_classes = share_location ? 1 : num_classes;
const float loc_multiplier = variance_encoded_in_target ? variance : 1.0f;
idx = 0;
for (int i = 0; i < num_of_images; ++i)
{
for (int h = 0; h < 2; ++h)
{
for (int w = 0; w < 2; ++w)
{
for (int c = 0; c < num_loc_classes; ++c)
{
for (int i = 0; i < num_of_images; ++i) {
for (int h = 0; h < 2; ++h) {
for (int w = 0; w < 2; ++w) {
for (int c = 0; c < num_loc_classes; ++c) {
location_data[idx++] = (w % 2 ? -1 : 1) * (i * 1 + c / 2.f + 0.5f) * loc_multiplier;
location_data[idx++] = (h % 2 ? -1 : 1) * (i * 1 + c / 2.f + 0.5f) * loc_multiplier;
location_data[idx++] = (w % 2 ? -1 : 1) * (i * 1 + c / 2.f + 0.5f) * loc_multiplier;
@ -123,8 +105,7 @@ public:
}
}
void init_buffer_sort(cldnn::memory input_buff)
{
void init_buffer_sort(cldnn::memory input_buff) {
auto input_data_ptr = input_buff.pointer<T>();
EXPECT_EQ((int)input_buff.count(), 128);
@ -150,8 +131,7 @@ public:
input_data[121] = -1; input_data[122] = 0; input_data[123] = 0; input_data[124] = 0; input_data[125] = 0; input_data[126] = 0; input_data[127] = 0;
}
void check_results(const memory& output, const int num, const std::string values)
{
void check_results(const memory& output, const int num, const std::string values) {
assert(num < output.get_layout().size.spatial[1]);
// Split values to vector of items.
@ -163,18 +143,15 @@ public:
// Check data.
auto out_ptr = output.pointer<T>();
const T* data = out_ptr.data();
for (int i = 0; i < 2; ++i)
{
for (int i = 0; i < 2; ++i) {
EXPECT_EQ(static_cast<int>((float)data[num * output.get_layout().size.spatial[0] + i]), atoi(items[i].c_str()));
}
for (int i = 2; i < 7; ++i)
{
for (int i = 2; i < 7; ++i) {
EXPECT_TRUE(floating_point_equal(data[num * output.get_layout().size.spatial[0] + i], (T)(float)atof(items[i].c_str())));
}
}
void setup_basic(bool runOnGPU)
{
void setup_basic() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 150;
@ -192,11 +169,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -213,8 +185,7 @@ public:
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
}
void setup_two_layers(bool runOnGPU)
{
void setup_two_layers() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 150;
@ -233,11 +204,6 @@ public:
topology.add(detection_output("detection_output_2", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -247,8 +213,7 @@ public:
EXPECT_EQ(outputs.size(), size_t(2));
unsigned i = 1;
for (auto it = outputs.begin(); it != outputs.begin(); it++)
{
for (auto it = outputs.begin(); it != outputs.begin(); it++) {
EXPECT_EQ(it->first, "detection_output_" + std::to_string(i));
@ -260,8 +225,7 @@ public:
}
}
void forward_share_location(bool runOnGPU)
{
void forward_share_location() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -282,11 +246,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -314,8 +273,7 @@ public:
check_results(output_prim, 7, "-1 0 0 0 0 0 0");
}
void forward_num_detections_greater_than_keep_top_k(bool runOnGPU)
{
void forward_num_detections_greater_than_keep_top_k() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 1;
@ -336,11 +294,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -362,8 +315,7 @@ public:
check_results(output_prim, 1, "1 1 0.6 0.45 0.45 0.75 0.75");
}
void forward_num_detections_smaller_than_keep_top_k(bool runOnGPU)
{
void forward_num_detections_smaller_than_keep_top_k() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 6;
@ -384,11 +336,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -420,8 +367,7 @@ public:
check_results(output_prim, 11, "-1 0 0 0 0 0 0");
}
void test_forward_share_location_top_k(bool runOnGPU)
{
void test_forward_share_location_top_k() {
const bool share_location = true;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 2;
@ -443,11 +389,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -471,8 +412,7 @@ public:
check_results(output_prim, 3, "-1 0 0 0 0 0 0");
}
void forward_no_share_location(bool runOnGPU)
{
void forward_no_share_location() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 10;
@ -493,11 +433,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -537,8 +472,7 @@ public:
check_results(output_prim, 19, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_top_k(bool runOnGPU)
{
void forward_no_share_location_top_k() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -560,11 +494,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -592,8 +521,7 @@ public:
check_results(output_prim, 7, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_neg_0(bool runOnGPU)
{
void forward_no_share_location_neg_0() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 5;
@ -614,11 +542,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -648,8 +571,7 @@ public:
check_results(output_prim, 9, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_neg_0_top_k(bool runOnGPU)
{
void forward_no_share_location_neg_0_top_k() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 2;
@ -671,11 +593,6 @@ public:
topology.add(detection_output("detection_output", "input_location", "input_confidence", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -699,8 +616,7 @@ public:
check_results(output_prim, 3, "-1 0 0 0 0 0 0");
}
void forward_no_share_location_top_k_input_padding(bool runOnGPU)
{
void forward_no_share_location_top_k_input_padding() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -723,11 +639,6 @@ public:
topology.add(detection_output("detection_output", "input_location_padded", "input_confidence_padded", "input_prior_box", this->num_classes, keep_top_k, share_location, background_label_id, this->nms_threshold, top_k));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -755,8 +666,7 @@ public:
check_results(output_prim, 7, "-1 0 0 0 0 0 0");
}
void test_forward_no_share_location_top_k_faster_rcnn_case(bool runOnGPU)
{
void test_forward_no_share_location_top_k_faster_rcnn_case() {
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 4;
@ -792,11 +702,6 @@ public:
));
build_options opts;
if (runOnGPU)
{
opts.set_option(build_option::detection_output_gpu(true));
}
network network(engine, topology, opts);
network.set_input_data("input_location", input_location);
network.set_input_data("input_confidence", input_confidence);
@ -834,181 +739,50 @@ public:
typedef ::testing::Types<float, FLOAT16> detection_output_test_types;
TYPED_TEST_CASE(detection_output_test, detection_output_test_types);
TYPED_TEST(detection_output_test, test_setup_basic)
{
this->setup_basic(false);
TYPED_TEST(detection_output_test, test_setup_basic) {
this->setup_basic();
}
TYPED_TEST(detection_output_test, test_setup_basic_gpu)
{
this->setup_basic(true);
TYPED_TEST(detection_output_test, test_setup_two_layers) {
this->setup_two_layers();
}
TYPED_TEST(detection_output_test, test_setup_two_layers)
{
this->setup_two_layers(false);
TYPED_TEST(detection_output_test, test_forward_share_location) {
this->forward_share_location();
}
TYPED_TEST(detection_output_test, test_setup_two_layers_gpu)
{
this->setup_two_layers(true);
TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k) {
this->forward_num_detections_greater_than_keep_top_k();
}
TYPED_TEST(detection_output_test, test_forward_share_location)
{
this->forward_share_location(false);
TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k) {
this->forward_num_detections_smaller_than_keep_top_k();
}
TYPED_TEST(detection_output_test, DISABLED_test_forward_share_location_gpu)
{
this->forward_share_location(true);
TYPED_TEST(detection_output_test, test_forward_share_location_top_k) {
this->test_forward_share_location_top_k();
}
TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k)
{
this->forward_num_detections_greater_than_keep_top_k(false);
TYPED_TEST(detection_output_test, test_forward_no_share_location) {
this->forward_no_share_location();
}
TYPED_TEST(detection_output_test, test_forward_num_detections_greater_than_keep_top_k_gpu)
{
this->forward_num_detections_greater_than_keep_top_k(true);
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k) {
this->forward_no_share_location_top_k();
}
TYPED_TEST(detection_output_test, test_forward_num_detections_smaller_than_keep_top_k)
{
this->forward_num_detections_smaller_than_keep_top_k(false);
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0) {
this->forward_no_share_location_neg_0();
}
TYPED_TEST(detection_output_test, DISABLED_test_forward_num_detections_smaller_than_keep_top_k_gpu)
{
this->forward_num_detections_smaller_than_keep_top_k(true);
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k) {
this->forward_no_share_location_neg_0_top_k();
}
TYPED_TEST(detection_output_test, test_forward_share_location_top_k)
{
this->test_forward_share_location_top_k(false);
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding) {
this->forward_no_share_location_top_k_input_padding();
}
TYPED_TEST(detection_output_test, test_forward_share_location_top_k_gpu)
{
this->test_forward_share_location_top_k(true);
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case) {
this->test_forward_no_share_location_top_k_faster_rcnn_case();
}
TYPED_TEST(detection_output_test, test_forward_no_share_location)
{
this->forward_no_share_location(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_gpu)
{
this->forward_no_share_location(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k)
{
this->forward_no_share_location_top_k(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_gpu)
{
this->forward_no_share_location_top_k(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0)
{
this->forward_no_share_location_neg_0(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_gpu)
{
this->forward_no_share_location_neg_0(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k)
{
this->forward_no_share_location_neg_0_top_k(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_neg_0_top_k_gpu)
{
this->forward_no_share_location_neg_0_top_k(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding)
{
this->forward_no_share_location_top_k_input_padding(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_input_padding_gpu)
{
this->forward_no_share_location_top_k_input_padding(true);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case)
{
this->test_forward_no_share_location_top_k_faster_rcnn_case(false);
}
TYPED_TEST(detection_output_test, test_forward_no_share_location_top_k_faster_rcnn_case_gpu)
{
this->test_forward_no_share_location_top_k_faster_rcnn_case(true);
}
TYPED_TEST(detection_output_test, test_detection_output_sort_gpu)
{
const bool share_location = false;
const int num_loc_classes = share_location ? 1 : this->num_classes;
const int keep_top_k = 10;
const int background_label_id = -1;
const int top_k = -1;
const unsigned out_row_size = 7;
const unsigned score_space = ((this->num_of_images + 15) / 16) * 16;
int input_size = this->num_of_images * num_loc_classes * this->num_priors * out_row_size + score_space;
const auto& engine = get_test_engine();
cldnn::memory input_buff = memory::allocate(engine, { type_to_data_type<TypeParam>::value, format::bfyx,{ 1, 1, 1, input_size } });
this->init_buffer_sort(input_buff);
topology topology;
topology.add(input_layout("input_location", input_buff.get_layout()));
topology.add(detection_output_sort("detection_output_sort", "input_location", this->num_of_images, this->num_classes, keep_top_k, share_location, top_k, background_label_id));
network network(engine, topology);
network.set_input_data("input_location", input_buff);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "detection_output_sort");
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.batch[0], 1);
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.feature[0], 1);
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[1], keep_top_k * this->num_of_images);
EXPECT_EQ(outputs.begin()->second.get_memory().get_layout().size.spatial[0], 7);
auto output_prim = outputs.begin()->second.get_memory();
this->check_results(output_prim, 0, "0 0 0.6 0.55 0.55 0.85 0.85");
this->check_results(output_prim, 1, "0 0 0.4 0.15 0.55 0.45 0.85");
this->check_results(output_prim, 2, "0 0 0.2 0.55 0.15 0.85 0.45");
this->check_results(output_prim, 3, "0 0 0.0 0.15 0.15 0.45 0.45");
this->check_results(output_prim, 4, "0 1 1.0 0.20 0.20 0.50 0.50");
this->check_results(output_prim, 5, "0 1 0.8 0.50 0.20 0.80 0.50");
this->check_results(output_prim, 6, "0 1 0.6 0.20 0.50 0.50 0.80");
this->check_results(output_prim, 7, "0 1 0.4 0.50 0.50 0.80 0.80");
this->check_results(output_prim, 8, "1 0 1.0 0.25 0.25 0.55 0.55");
this->check_results(output_prim, 9, "1 0 0.4 0.45 0.45 0.75 0.75");
this->check_results(output_prim, 10, "1 1 0.6 0.40 0.40 0.70 0.70");
this->check_results(output_prim, 11, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 12, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 13, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 14, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 15, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 16, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 17, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 18, "-1 0 0 0 0 0 0");
this->check_results(output_prim, 19, "-1 0 0 0 0 0 0");
}