/*
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
///////////////////////////////////////////////////////////////////////////////////////////////////
#include <gtest/gtest.h>
#include <gmock/gmock.h>
#include "api/memory.hpp"
#include <api/input_layout.hpp>
#include "api/convolution.hpp"
#include "api/eltwise.hpp"
#include <api/topology.hpp>
#include <api/network.hpp>
#include <api/engine.hpp>
#include "test_utils/test_utils.h"
#include "test_utils/float16.h"
#include <api/data.hpp>
#include <algorithm>
#include <array>
#include <cmath>
#include <iostream>
#include <iomanip>
#include <thread>
#include <type_traits>
#include <fstream>
#include <tuple>
#include <vector>
#include <api/crop.hpp>
#include <api/reorder.hpp>
#include <src/include/to_string_utils.h>
using namespace cldnn;
using namespace tests;
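// clDNN has no built-in mapping from the FLOAT16 test helper type to a
// data_types enum value, so register one here: FLOAT16 buffers are f16.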
namespace cldnn
{
template<> struct type_to_data_type<FLOAT16> { static const data_types value = data_types::f16; };
}
template<typename T>
T kahan_summation(std::vector<T> &input) {
T sum = 0;
T c = 0;
for (T x : input) {
T y = x - c;
T t = sum + y;
c = (t - sum) - y;
sum = t;
}
return sum;
}
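// Illustrative check of the helper above (a sketch added for clarity): adding
// 0.1f one million times in plain float arithmetic drifts by hundreds, because
// the rounding error of each addition accumulates systematically, while the
// compensated sum stays within a fraction of a unit of the exact value (~100000).
TEST(kahan_summation, compensates_rounding_error_of_long_float_sums) {
    std::vector<float> input(1000000, 0.1f);
    float naive = 0.f;
    for (float v : input)
        naive += v;
    EXPECT_GT(std::abs(naive - 100000.f), 1.f);           // naive sum drifts
    EXPECT_NEAR(100000.f, kahan_summation(input), 0.5f);  // compensated sum holds
}
// int8/uint8 inputs accumulate in a wider integer type so the reference result
// cannot overflow the 8-bit range; all other types accumulate as themselves.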
template <typename InputT>
struct convolution_accumulator {
using type = InputT;
};
template <>
struct convolution_accumulator<int8_t> {
using type = int;
};
template <>
struct convolution_accumulator<uint8_t> {
using type = int;
};
template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT, typename AccT = typename convolution_accumulator<InputT>::type>
VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
{
size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
size_t kernel_extent_x = dilation_x * (filter[0][0].size() - 1) + 1;
size_t output_y = 1 + (input[0].size() - kernel_extent_y + 2 * input_padding_y) / stride_y + 2 * output_padding_y;
size_t output_x = 1 + (input[0][0].size() - kernel_extent_x + 2 * input_padding_x) / stride_x + 2 * output_padding_x;
bool asymm_data = !data_zp.empty();
bool asymm_weights = weights_zp != static_cast<WeightsT>(0);
VVF<OutputT> output(output_y, VF<OutputT>(output_x, 0));
size_t filter_begin = f_begin ? f_begin : 0;
size_t filter_end = f_end ? f_end : filter.size();
for (size_t f = filter_begin; f < filter_end; ++f) {
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
VF<AccT> values;
values.reserve(filter[0].size() * filter[0][0].size());
for (size_t yf = 0; yf < filter[0].size(); ++yf) {
int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
bool yi_inside = yi >= 0 && (int)input[0].size() > yi;
if (!yi_inside) continue;
for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
bool xi_inside = xi >= 0 && (int)input[0][0].size() > xi;
if (!xi_inside) continue;
auto input_val = static_cast<AccT>(input[f][yi][xi]);
if (asymm_data) {
input_val = input_val - static_cast<AccT>(data_zp[f]);
}
AccT weights_val;
if (!depthwise && !grouped) {
weights_val = static_cast<AccT>(filter[f][yf][xf]);
} else if (grouped) {
weights_val = static_cast<AccT>(filter[f - filter_begin][yf][xf]);
} else {
weights_val = static_cast<AccT>(filter[0][yf][xf]);
}
if (asymm_weights) {
weights_val = weights_val - static_cast<AccT>(weights_zp);
}
values.push_back(input_val * weights_val);
}
}
output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(kahan_summation<AccT>(values));
}
}
}
for (size_t y = 0; y < (output_y - 2 * output_padding_y); ++y) {
for (size_t x = 0; x < (output_x - 2 * output_padding_x); ++x) {
output[y + output_padding_y][x + output_padding_x] += static_cast<OutputT>(bias);
}
}
return output;
}
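// Minimal hand-checkable example for reference_convolve (a sketch added for
// clarity): a single-feature 3x3 input convolved with an all-ones 2x2 filter
// at stride 1 yields the four overlapping 2x2 window sums.
TEST(reference_convolve, matches_hand_computed_window_sums) {
    VVVF<float> input = { { { 1.f, 2.f, 3.f },
                            { 4.f, 5.f, 6.f },
                            { 7.f, 8.f, 9.f } } };
    VVVF<float> filter = { { { 1.f, 1.f },
                             { 1.f, 1.f } } };
    auto out = reference_convolve(input, filter, 1, 1, 0.f);
    EXPECT_FLOAT_EQ(12.f, out[0][0]);  // 1+2+4+5
    EXPECT_FLOAT_EQ(16.f, out[0][1]);  // 2+3+5+6
    EXPECT_FLOAT_EQ(24.f, out[1][0]);  // 4+5+7+8
    EXPECT_FLOAT_EQ(28.f, out[1][1]);  // 5+6+8+9
}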
template <typename T>
VVF<T> reference_scale_post_op(const VVF<T>& input, const T& scale, const T& shift) {
auto output = input;
auto size_y = input.size();
auto size_x = input[0].size();
for (size_t yi = 0; yi < size_y; ++yi) {
for (size_t xi = 0; xi < size_x; ++xi) {
output[yi][xi] = output[yi][xi] * scale + shift;
}
}
return output;
}
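// Debug helper, kept for manual use (no test below calls it): writes a float
// buffer to a file, one spatial row per line, grouped by batch and feature,
// preceded by the layout's size and padding description.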
void dump_buffer(memory const& mem, std::string const& name)
{
std::ofstream out(name);
auto size = mem.get_layout().get_buffer_size();
auto ptr = mem.pointer<const float>();
auto pitches = mem.get_layout().get_pitches();
out << "Data size: " << mem.get_layout().size << "\n";
out << "Lower padding: " << mem.get_layout().data_padding.lower_size() << "\n";
out << "Upper padding: " << mem.get_layout().data_padding.upper_size() << "\n";
out << "\n";
for (int b = 0; b < size.batch[0]; ++b)
{
out << " ================ BATCH " << b << " =================\n\n";
for (int f = 0; f < size.feature[0]; ++f)
{
out << "feature " << f << ":\n";
for (int y = 0; y < size.spatial[1]; ++y)
{
for (int x = 0; x < size.spatial[0]; ++x)
{
size_t idx = b * pitches.batch[0] + f * pitches.feature[0] + y * pitches.spatial[1] + x * pitches.spatial[0];
out << ptr[idx] << " ";
}
out << "\n";
}
out << "\n";
}
out << "\n";
}
}
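// In the deformable convolution tests below, the second input "trans" carries
// the sampling offsets: one (y, x) pair per kernel element and per deformable
// group. With a 3x3 filter its feature count is therefore 2 * 3 * 3 = 18 when
// def_group = 1, and 36 when def_group = 2.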
TEST(deformable_convolution_f32_fw_gpu, basic_deformable_convolution_def_group1_2) {
// Input : 4x4x4
// Trans : 18x4x4
// Output : 4x4x4
// In_offset: 1x1
// Filter : 3x3
// Stride : 1x1
// Dilation : 1x1
// Group : 1
// Def_group: 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 4, 4 } });
auto trans = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 18, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 4, 4, 3, 3 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 1 } });
set_values(input, { 0.680375f, -0.211234f, 0.566198f, 0.59688f, 0.823295f, -0.604897f, -0.329554f, 0.536459f,
-0.444451f, 0.10794f, -0.0452059f, 0.257742f, -0.270431f, 0.0268018f, 0.904459f, 0.83239f,
0.271423f, 0.434594f, -0.716795f, 0.213938f, -0.967399f, -0.514226f, -0.725537f, 0.608353f,
-0.686642f, -0.198111f, -0.740419f, -0.782382f, 0.997849f, -0.563486f, 0.0258648f, 0.678224f,
0.22528f, -0.407937f, 0.275105f, 0.0485743f, -0.012834f, 0.94555f, -0.414966f, 0.542715f,
0.0534899f, 0.539828f, -0.199543f, 0.783059f, -0.433371f, -0.295083f, 0.615449f, 0.838053f,
-0.860489f, 0.898654f, 0.0519907f, -0.827888f, -0.615572f, 0.326454f, 0.780465f, -0.302214f,
-0.871657f, -0.959954f, -0.0845965f, -0.873808f, -0.52344f, 0.941268f, 0.804416f, 0.70184f });
set_values(trans, { -0.466668f, 0.0795207f, -0.249586f, 0.520497f, 0.0250708f, 0.335448f, 0.0632129f, -0.921439f, -0.124725f,
0.86367f, 0.86162f, 0.441905f, -0.431413f, 0.477069f, 0.279958f, -0.291903f, 0.375723f, -0.668052f,
-0.119791f, 0.76015f, 0.658402f, -0.339326f, -0.542064f, 0.786745f, -0.29928f, 0.37334f, 0.912936f,
0.17728f, 0.314608f, 0.717353f, -0.12088f, 0.84794f, -0.203127f, 0.629534f, 0.368437f, 0.821944f,
-0.0350187f, -0.56835f, 0.900505f, 0.840256f, -0.70468f, 0.762124f, 0.282161f, -0.136093f, 0.239193f,
-0.437881f, 0.572004f, -0.385084f, -0.105933f, -0.547787f, -0.624934f, -0.447531f, 0.112888f, -0.166997f,
-0.660786f, 0.813608f, -0.793658f, -0.747849f, -0.00911188f, 0.52095f, 0.969503f, 0.870008f, 0.36889f,
-0.233623f, 0.499542f, -0.262673f, -0.411679f, -0.535477f, 0.168977f, -0.511175f, -0.69522f, 0.464297f,
-0.74905f, 0.586941f, -0.671796f, 0.490143f, -0.85094f, 0.900208f, -0.894941f, 0.0431267f, -0.647579f,
-0.519875f, 0.595596f, 0.465309f, 0.313127f, 0.93481f, 0.278917f, 0.51947f, -0.813039f, -0.730195f,
0.0404202f, -0.843536f, -0.860187f, -0.59069f, -0.077159f, 0.639355f, 0.146637f, 0.511162f, -0.896122f,
-0.684386f, 0.999987f, -0.591343f, 0.779911f, -0.749063f, 0.995598f, -0.891885f, 0.74108f, -0.855342f,
-0.991677f, 0.846138f, 0.187784f, -0.639255f, -0.673737f, -0.21662f, 0.826053f, 0.63939f, -0.281809f,
0.10497f, 0.15886f, -0.0948483f, 0.374775f, -0.80072f, 0.0616159f, 0.514588f, -0.39141f, 0.984457f,
0.153942f, 0.755228f, 0.495619f, 0.25782f, -0.929158f, 0.495606f, 0.666477f, 0.850753f, 0.746543f,
0.662075f, 0.958868f, 0.487622f, 0.806733f, 0.967191f, 0.333761f, -0.00548297f, -0.672064f, 0.660024f,
0.777897f, -0.846011f, 0.299414f, -0.503912f, 0.258959f, -0.541726f, 0.40124f, -0.366266f, -0.342446f,
-0.537144f, -0.851678f, 0.266144f, -0.552687f, 0.302264f, 0.021372f, 0.942931f, -0.439916f, 0.0922137f,
0.438537f, -0.773439f, -0.0570331f, 0.18508f, 0.888636f, -0.0981649f, -0.327298f, 0.695369f, -0.130973f,
-0.993537f, -0.310114f, 0.196963f, 0.666487f, -0.532217f, 0.350952f, -0.0340995f, -0.0361283f, -0.390089f,
0.424175f, -0.634888f, 0.243646f, -0.918271f, -0.172033f, 0.391968f, 0.347873f, 0.27528f, -0.305768f,
-0.630755f, 0.218212f, 0.254316f, 0.461459f, -0.343251f, 0.480877f, -0.595574f, 0.841829f, 0.369513f,
0.306261f, -0.485469f, 0.0648819f, -0.824713f, -0.479006f, 0.754768f, 0.37225f, -0.81252f, -0.777449f,
-0.276798f, 0.153381f, 0.186423f, 0.333113f, -0.422444f, 0.551535f, -0.423241f, -0.340716f, -0.620498f,
0.968726f, -0.992843f, 0.654782f, -0.337042f, -0.623598f, -0.127006f, 0.917274f, 0.837861f, 0.529743f,
0.398151f, -0.757714f, 0.371572f, -0.232336f, 0.548547f, 0.886103f, 0.832546f, 0.723834f, -0.592904f,
0.587314f, 0.0960841f, -0.405423f, 0.809865f, 0.819286f, 0.747958f, -0.00371218f, 0.152399f, -0.674487f,
-0.452178f, 0.729158f, -0.0152023f, -0.0726757f, 0.697884f, -0.0080452f, -0.417893f, -0.639158f, 0.368357f,
0.455101f, -0.721884f, 0.206218f, -0.0151566f, 0.676267f, 0.448504f, -0.643585f, -0.556069f, -0.00294906f,
-0.757482f, -0.723523f, -0.279115f, -0.350386f, 0.863791f, 0.816969f, 0.244191f, 0.673656f, 0.636255f,
-0.00785118f, -0.330057f, -0.211346f, 0.317662f, 0.217766f, -0.482188f, -0.69754f, -0.85491f, -0.784303f,
0.294415f, -0.272803f, -0.423461f, -0.337228f, -0.817703f, -0.145345f, 0.868989f, 0.167141f, -0.469077f });
set_values(weights, { 0.0f, 0.841471f, 0.909297f, 0.14112f, -0.756802f, -0.958924f, -0.279415f, 0.656987f, 0.989358f,
0.412118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.912945f, 0.836656f,
-0.00885131f, -0.84622f, -0.905578f, -0.132352f, 0.762558f, 0.956376f, 0.270906f, -0.663634f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.745113f, -0.158623f, -0.916522f,
-0.831775f, 0.0177019f, 0.850904f, 0.901788f, 0.123573f, -0.768255f, -0.953753f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.304811f, -0.966118f, -0.739181f, 0.167356f,
0.920026f, 0.826829f, -0.0265512f, -0.85552f, -0.897928f, -0.114785f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.993889f, -0.629888f, 0.313229f, 0.968364f, 0.73319f,
-0.176076f, -0.923458f, -0.821818f, 0.0353983f, 0.860069f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, -0.506366f, 0.452026f, 0.994827f, 0.622989f, -0.321622f, -0.970535f,
-0.727143f, 0.184782f, 0.926818f, 0.816743f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.580611f, 0.998815f, 0.498713f, -0.459903f, -0.995687f, -0.61604f, 0.329991f,
0.97263f, 0.721038f, -0.193473f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.98024f, 0.363171f, -0.587795f, -0.998345f });
set_values(biases, { -0.491022f, 0.467745f, 0.996469f, 0.609044f });
std::vector<float> output_vec = {
-0.0742483f, -2.09984f, -0.850381f, 0.0398813f, -1.06922f, -0.0233979f, -1.15264f, -0.970688f, -0.0428347f,
-1.73668f, 0.613717f, 2.12469f, 0.450835f, -1.82602f, -0.57416f, -0.682909f, -0.211437f, 0.351543f,
0.930845f, 0.412505f, 1.23318f, 0.894126f, 1.56587f, 0.882479f, 0.640997f, -1.94229f, -1.00846f, -3.64185f,
-0.383791f, -0.274041f, -1.49479f, -2.82027f, 0.858848f, 1.7228f, 0.51184f, 0.537693f, 1.40331f, 0.192823f,
0.325383f, 0.814044f, 1.19015f, 0.403436f, 1.40995f, 0.42931f, 0.131369f, 2.01262f, 0.253117f, 0.018361f,
1.3469f, 1.15957f, 0.599044f, 1.48224f, 2.16468f, 0.504246f, -1.52044f, 0.10271f, 0.0379517f, 0.942841f,
-2.6067f, 0.562893f, 0.671884f, 0.404735f, 1.45044f, 0.950113f };
topology topology(
input_layout("input", input.get_layout()),
input_layout("trans", trans.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
"trans",
{ "weights" },
{ "biases" },
1,                 // groups
1,                 // deformable groups
{ 1, 1, 1, 1 },    // stride
{ 0, 0, -1, -1 },  // input offset
{ 1, 1, 1, 1 },    // dilation
{ 1, 4, 4, 4 })    // output size
);
network network(engine, topology);
network.set_input_data("input", input);
network.set_input_data("trans", trans);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 4);
EXPECT_EQ(x_size, 4);
EXPECT_EQ(f_size, 4);
EXPECT_EQ(b_size, 1);
for (size_t i = 0; i < output_vec.size(); ++i) {
EXPECT_NEAR(output_vec[i], output_ptr[i], 0.1);
}
}
TEST(deformable_convolution_f32_fw_gpu, basic_deformable_convolution_def_group1) {
// Input : 4x4x4
// Trans : 18x4x4
// Output : 4x4x4
// In_offset: 2x2
// Filter : 3x3
// Stride : 1x1
// Dilation : 2x2
// Group : 1
// Def_group: 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 4, 4 } });
auto trans = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 18, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 4, 4, 3, 3 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 1 } });
set_values(input, { 0.680375f, -0.211234f, 0.566198f, 0.59688f, 0.823295f, -0.604897f, -0.329554f, 0.536459f,
-0.444451f, 0.10794f, -0.0452059f, 0.257742f, -0.270431f, 0.0268018f, 0.904459f, 0.83239f,
0.271423f, 0.434594f, -0.716795f, 0.213938f, -0.967399f, -0.514226f, -0.725537f, 0.608353f,
-0.686642f, -0.198111f, -0.740419f, -0.782382f, 0.997849f, -0.563486f, 0.0258648f, 0.678224f,
0.22528f, -0.407937f, 0.275105f, 0.0485743f, -0.012834f, 0.94555f, -0.414966f, 0.542715f,
0.0534899f, 0.539828f, -0.199543f, 0.783059f, -0.433371f, -0.295083f, 0.615449f, 0.838053f,
-0.860489f, 0.898654f, 0.0519907f, -0.827888f, -0.615572f, 0.326454f, 0.780465f, -0.302214f,
-0.871657f, -0.959954f, -0.0845965f, -0.873808f, -0.52344f, 0.941268f, 0.804416f, 0.70184f });
set_values(trans, { -0.466668f, 0.0795207f, -0.249586f, 0.520497f, 0.0250708f, 0.335448f, 0.0632129f, -0.921439f, -0.124725f,
0.86367f, 0.86162f, 0.441905f, -0.431413f, 0.477069f, 0.279958f, -0.291903f, 0.375723f, -0.668052f,
-0.119791f, 0.76015f, 0.658402f, -0.339326f, -0.542064f, 0.786745f, -0.29928f, 0.37334f, 0.912936f,
0.17728f, 0.314608f, 0.717353f, -0.12088f, 0.84794f, -0.203127f, 0.629534f, 0.368437f, 0.821944f,
-0.0350187f, -0.56835f, 0.900505f, 0.840256f, -0.70468f, 0.762124f, 0.282161f, -0.136093f, 0.239193f,
-0.437881f, 0.572004f, -0.385084f, -0.105933f, -0.547787f, -0.624934f, -0.447531f, 0.112888f, -0.166997f,
-0.660786f, 0.813608f, -0.793658f, -0.747849f, -0.00911188f, 0.52095f, 0.969503f, 0.870008f, 0.36889f,
-0.233623f, 0.499542f, -0.262673f, -0.411679f, -0.535477f, 0.168977f, -0.511175f, -0.69522f, 0.464297f,
-0.74905f, 0.586941f, -0.671796f, 0.490143f, -0.85094f, 0.900208f, -0.894941f, 0.0431267f, -0.647579f,
-0.519875f, 0.595596f, 0.465309f, 0.313127f, 0.93481f, 0.278917f, 0.51947f, -0.813039f, -0.730195f,
0.0404202f, -0.843536f, -0.860187f, -0.59069f, -0.077159f, 0.639355f, 0.146637f, 0.511162f, -0.896122f,
-0.684386f, 0.999987f, -0.591343f, 0.779911f, -0.749063f, 0.995598f, -0.891885f, 0.74108f, -0.855342f,
-0.991677f, 0.846138f, 0.187784f, -0.639255f, -0.673737f, -0.21662f, 0.826053f, 0.63939f, -0.281809f,
0.10497f, 0.15886f, -0.0948483f, 0.374775f, -0.80072f, 0.0616159f, 0.514588f, -0.39141f, 0.984457f,
0.153942f, 0.755228f, 0.495619f, 0.25782f, -0.929158f, 0.495606f, 0.666477f, 0.850753f, 0.746543f,
0.662075f, 0.958868f, 0.487622f, 0.806733f, 0.967191f, 0.333761f, -0.00548297f, -0.672064f, 0.660024f,
0.777897f, -0.846011f, 0.299414f, -0.503912f, 0.258959f, -0.541726f, 0.40124f, -0.366266f, -0.342446f,
-0.537144f, -0.851678f, 0.266144f, -0.552687f, 0.302264f, 0.021372f, 0.942931f, -0.439916f, 0.0922137f,
0.438537f, -0.773439f, -0.0570331f, 0.18508f, 0.888636f, -0.0981649f, -0.327298f, 0.695369f, -0.130973f,
-0.993537f, -0.310114f, 0.196963f, 0.666487f, -0.532217f, 0.350952f, -0.0340995f, -0.0361283f, -0.390089f,
0.424175f, -0.634888f, 0.243646f, -0.918271f, -0.172033f, 0.391968f, 0.347873f, 0.27528f, -0.305768f,
-0.630755f, 0.218212f, 0.254316f, 0.461459f, -0.343251f, 0.480877f, -0.595574f, 0.841829f, 0.369513f,
0.306261f, -0.485469f, 0.0648819f, -0.824713f, -0.479006f, 0.754768f, 0.37225f, -0.81252f, -0.777449f,
-0.276798f, 0.153381f, 0.186423f, 0.333113f, -0.422444f, 0.551535f, -0.423241f, -0.340716f, -0.620498f,
0.968726f, -0.992843f, 0.654782f, -0.337042f, -0.623598f, -0.127006f, 0.917274f, 0.837861f, 0.529743f,
0.398151f, -0.757714f, 0.371572f, -0.232336f, 0.548547f, 0.886103f, 0.832546f, 0.723834f, -0.592904f,
0.587314f, 0.0960841f, -0.405423f, 0.809865f, 0.819286f, 0.747958f, -0.00371218f, 0.152399f, -0.674487f,
-0.452178f, 0.729158f, -0.0152023f, -0.0726757f, 0.697884f, -0.0080452f, -0.417893f, -0.639158f, 0.368357f,
0.455101f, -0.721884f, 0.206218f, -0.0151566f, 0.676267f, 0.448504f, -0.643585f, -0.556069f, -0.00294906f,
-0.757482f, -0.723523f, -0.279115f, -0.350386f, 0.863791f, 0.816969f, 0.244191f, 0.673656f, 0.636255f,
-0.00785118f, -0.330057f, -0.211346f, 0.317662f, 0.217766f, -0.482188f, -0.69754f, -0.85491f, -0.784303f,
0.294415f, -0.272803f, -0.423461f, -0.337228f, -0.817703f, -0.145345f, 0.868989f, 0.167141f, -0.469077f });
set_values(weights, { 0.0f, 0.841471f, 0.909297f, 0.14112f, -0.756802f, -0.958924f, -0.279415f, 0.656987f,
0.989358f, 0.412118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.912945f,
0.836656f, -0.00885131f, -0.84622f, -0.905578f, -0.132352f, 0.762558f, 0.956376f, 0.270906f,
-0.663634f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.745113f, -0.158623f,
-0.916522f, -0.831775f, 0.0177019f, 0.850904f, 0.901788f, 0.123573f, -0.768255f, -0.953753f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.304811f, -0.966118f, -0.739181f,
0.167356f, 0.920026f, 0.826829f, -0.0265512f, -0.85552f, -0.897928f, -0.114785f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.993889f, -0.629888f, 0.313229f, 0.968364f,
0.73319f, -0.176076f, -0.923458f, -0.821818f, 0.0353983f, 0.860069f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.506366f, 0.452026f, 0.994827f, 0.622989f, -0.321622f,
-0.970535f, -0.727143f, 0.184782f, 0.926818f, 0.816743f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.580611f, 0.998815f, 0.498713f, -0.459903f, -0.995687f, -0.61604f,
0.329991f, 0.97263f, 0.721038f, -0.193473f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.98024f, 0.363171f, -0.587795f, -0.998345f });
set_values(biases, { -0.491022f, 0.467745f, 0.996469f, 0.609044f });
std::vector<float> output_vec = {
0.304297f, -0.385894f, -1.44155f, -0.954556f, -0.260702f, -0.0162079f, 1.05196f, -0.129013f, 0.668587f,
-1.4058f, -0.0966965f, -0.45043f, -2.23299f, -1.56306f, 0.083207f, -0.42985f, -0.00589353f, 1.08037f,
1.06648f, 0.0936709f, 1.62321f, 1.50433f, 0.00480294f, -0.0550415f, 0.165425f, 0.146279f, -0.45487f,
0.370202f, 0.177222f, -1.03958f, -0.744073f, -0.375273f, 0.587801f, 0.120338f, 1.17536f, 1.88443f,
0.119988f, -0.540461f, -1.7228f, 1.54217f, 0.962263f, -0.0363407f, 0.762274f, 1.32504f, 1.43954f,
-0.143791f, 1.21981f, 1.71111f, 0.195772f, 0.650412f, 0.474924f, 0.929919f, -0.442715f, 0.462916f,
-0.210789f, -0.973089f, -0.407542f, 1.11818f, 0.843776f, 0.628229f, 1.29095f, 1.18637f, 0.808982f, 1.43841f };
topology topology(
input_layout("input", input.get_layout()),
input_layout("trans", trans.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
"trans",
{ "weights" },
{ "biases" },
1,                 // groups
1,                 // deformable groups
{ 1, 1, 1, 1 },    // stride
{ 0, 0, -2, -2 },  // input offset
{ 1, 1, 2, 2 },    // dilation
{ 1, 4, 4, 4 })    // output size
);
network network(engine, topology);
network.set_input_data("input", input);
network.set_input_data("trans", trans);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 4);
EXPECT_EQ(x_size, 4);
EXPECT_EQ(f_size, 4);
EXPECT_EQ(b_size, 1);
for (size_t i = 0; i < output_vec.size(); ++i) {
EXPECT_NEAR(output_vec[i], output_ptr[i], 0.1);
}
}
TEST(deformable_convolution_f32_fw_gpu, basic_deformable_convolution) {
// Input : 4x4x4
// Trans : 36x4x4
// Output : 4x4x4
// In_offset: 2x2
// Filter : 3x3
// Stride : 1x1
// Dilation : 2x2
// Group : 1
// Def_group: 2
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 4, 4 } });
auto trans = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 36, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 4, 4, 3, 3 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 1 } });
set_values(input, { 0.680375f, -0.211234f, 0.566198f, 0.59688f, 0.823295f, -0.604897f, -0.329554f, 0.536459f,
-0.444451f, 0.10794f, -0.0452059f, 0.257742f, -0.270431f, 0.0268018f, 0.904459f, 0.83239f,
0.271423f, 0.434594f, -0.716795f, 0.213938f, -0.967399f, -0.514226f, -0.725537f, 0.608353f,
-0.686642f, -0.198111f, -0.740419f, -0.782382f, 0.997849f, -0.563486f, 0.0258648f, 0.678224f,
0.22528f, -0.407937f, 0.275105f, 0.0485743f, -0.012834f, 0.94555f, -0.414966f, 0.542715f,
0.0534899f, 0.539828f, -0.199543f, 0.783059f, -0.433371f, -0.295083f, 0.615449f, 0.838053f,
-0.860489f, 0.898654f, 0.0519907f, -0.827888f, -0.615572f, 0.326454f, 0.780465f, -0.302214f,
-0.871657f, -0.959954f, -0.0845965f, -0.873808f, -0.52344f, 0.941268f, 0.804416f, 0.70184f });
set_values(trans, { -0.466668f, 0.0795207f, -0.249586f, 0.520497f, 0.0250708f, 0.335448f, 0.0632129f, -0.921439f, -0.124725f,
0.86367f, 0.86162f, 0.441905f, -0.431413f, 0.477069f, 0.279958f, -0.291903f, 0.375723f, -0.668052f,
-0.119791f, 0.76015f, 0.658402f, -0.339326f, -0.542064f, 0.786745f, -0.29928f, 0.37334f, 0.912936f,
0.17728f, 0.314608f, 0.717353f, -0.12088f, 0.84794f, -0.203127f, 0.629534f, 0.368437f, 0.821944f,
-0.0350187f, -0.56835f, 0.900505f, 0.840256f, -0.70468f, 0.762124f, 0.282161f, -0.136093f, 0.239193f,
-0.437881f, 0.572004f, -0.385084f, -0.105933f, -0.547787f, -0.624934f, -0.447531f, 0.112888f, -0.166997f,
-0.660786f, 0.813608f, -0.793658f, -0.747849f, -0.00911188f, 0.52095f, 0.969503f, 0.870008f, 0.36889f,
-0.233623f, 0.499542f, -0.262673f, -0.411679f, -0.535477f, 0.168977f, -0.511175f, -0.69522f, 0.464297f,
-0.74905f, 0.586941f, -0.671796f, 0.490143f, -0.85094f, 0.900208f, -0.894941f, 0.0431267f, -0.647579f,
-0.519875f, 0.595596f, 0.465309f, 0.313127f, 0.93481f, 0.278917f, 0.51947f, -0.813039f, -0.730195f,
0.0404202f, -0.843536f, -0.860187f, -0.59069f, -0.077159f, 0.639355f, 0.146637f, 0.511162f, -0.896122f,
-0.684386f, 0.999987f, -0.591343f, 0.779911f, -0.749063f, 0.995598f, -0.891885f, 0.74108f, -0.855342f,
-0.991677f, 0.846138f, 0.187784f, -0.639255f, -0.673737f, -0.21662f, 0.826053f, 0.63939f, -0.281809f,
0.10497f, 0.15886f, -0.0948483f, 0.374775f, -0.80072f, 0.0616159f, 0.514588f, -0.39141f, 0.984457f,
0.153942f, 0.755228f, 0.495619f, 0.25782f, -0.929158f, 0.495606f, 0.666477f, 0.850753f, 0.746543f,
0.662075f, 0.958868f, 0.487622f, 0.806733f, 0.967191f, 0.333761f, -0.00548297f, -0.672064f, 0.660024f,
0.777897f, -0.846011f, 0.299414f, -0.503912f, 0.258959f, -0.541726f, 0.40124f, -0.366266f, -0.342446f,
-0.537144f, -0.851678f, 0.266144f, -0.552687f, 0.302264f, 0.021372f, 0.942931f, -0.439916f, 0.0922137f,
0.438537f, -0.773439f, -0.0570331f, 0.18508f, 0.888636f, -0.0981649f, -0.327298f, 0.695369f, -0.130973f,
-0.993537f, -0.310114f, 0.196963f, 0.666487f, -0.532217f, 0.350952f, -0.0340995f, -0.0361283f, -0.390089f,
0.424175f, -0.634888f, 0.243646f, -0.918271f, -0.172033f, 0.391968f, 0.347873f, 0.27528f, -0.305768f,
-0.630755f, 0.218212f, 0.254316f, 0.461459f, -0.343251f, 0.480877f, -0.595574f, 0.841829f, 0.369513f,
0.306261f, -0.485469f, 0.0648819f, -0.824713f, -0.479006f, 0.754768f, 0.37225f, -0.81252f, -0.777449f,
-0.276798f, 0.153381f, 0.186423f, 0.333113f, -0.422444f, 0.551535f, -0.423241f, -0.340716f, -0.620498f,
0.968726f, -0.992843f, 0.654782f, -0.337042f, -0.623598f, -0.127006f, 0.917274f, 0.837861f, 0.529743f,
0.398151f, -0.757714f, 0.371572f, -0.232336f, 0.548547f, 0.886103f, 0.832546f, 0.723834f, -0.592904f,
0.587314f, 0.0960841f, -0.405423f, 0.809865f, 0.819286f, 0.747958f, -0.00371218f, 0.152399f, -0.674487f,
-0.452178f, 0.729158f, -0.0152023f, -0.0726757f, 0.697884f, -0.0080452f, -0.417893f, -0.639158f, 0.368357f,
0.455101f, -0.721884f, 0.206218f, -0.0151566f, 0.676267f, 0.448504f, -0.643585f, -0.556069f, -0.00294906f,
-0.757482f, -0.723523f, -0.279115f, -0.350386f, 0.863791f, 0.816969f, 0.244191f, 0.673656f, 0.636255f,
-0.00785118f, -0.330057f, -0.211346f, 0.317662f, 0.217766f, -0.482188f, -0.69754f, -0.85491f, -0.784303f,
0.294415f, -0.272803f, -0.423461f, -0.337228f, -0.817703f, -0.145345f, 0.868989f, 0.167141f, -0.469077f,
0.317493f, 0.523556f, -0.0251462f, -0.685456f, 0.766073f, 0.251331f, 0.0354295f, -0.584313f, 0.115121f,
-0.147601f, 0.659878f, -0.211223f, -0.511346f, -0.347973f, 0.45872f, 0.277308f, 0.969689f, -0.323514f,
0.79512f, -0.727851f, -0.178424f, -0.989183f, 0.566564f, 0.548772f, -0.412644f, -0.770664f, 0.73107f,
0.442012f, -0.901675f, -0.10179f, 0.972934f, 0.415818f, -0.578234f, -0.0522121f, 0.730362f, -0.812161f,
-0.800881f, -0.234208f, -0.396474f, 0.31424f, 0.618191f, -0.736596f, -0.896983f, -0.893155f, -0.0845688f,
0.561737f, 0.384153f, -0.11488f, -0.761777f, 0.179273f, 0.15727f, 0.0597985f, 0.190091f, -0.276166f,
-0.391429f, 0.777447f, -0.0468307f, -0.66036f, 0.219458f, 0.0514942f, 0.237851f, 0.192392f, -0.532688f,
0.659617f, -0.85982f, -0.802325f, 0.847456f, -0.660701f, -0.0365333f, -0.549018f, 0.653539f, -0.418343f,
-0.285614f, 0.756555f, -0.311498f, 0.629817f, 0.318292f, -0.927345f, -0.485062f, 0.556515f, 0.251928f,
0.672207f, -0.383687f, -0.557981f, -0.603959f, 0.224884f, -0.780535f, 0.349211f, 0.564525f, 0.438924f,
-0.599295f, -0.197625f, -0.368684f, -0.131983f, -0.538008f, -0.228504f, 0.0656919f, -0.690552f, 0.110795f,
-0.970841f, -0.239571f, -0.235666f, -0.389184f, 0.474815f, -0.47911f, 0.299318f, 0.104633f, 0.839182f,
0.371973f, 0.619571f, 0.395696f, -0.376099f, 0.291778f, -0.98799f, 0.0659196f, 0.687819f, 0.236894f,
0.285385f, 0.0370297f, -0.198582f, -0.275691f, 0.437734f, 0.603793f, 0.355625f, -0.694248f, -0.934215f,
-0.872879f, 0.371444f, -0.624767f, 0.237917f, 0.400602f, 0.135662f, -0.997749f, -0.988582f, -0.389522f,
-0.476859f, 0.310736f, 0.71511f, -0.637678f, -0.317291f, 0.334681f, 0.758019f, 0.30661f, -0.373541f,
0.770028f, -0.62747f, -0.685722f, 0.00692201f, 0.657915f, 0.351308f, 0.80834f, -0.617777f, -0.210957f,
0.412133f, 0.737848f, 0.0947942f, 0.477919f, 0.864969f, -0.533762f, 0.853152f, 0.102886f, 0.86684f,
-0.0111862f, 0.105137f, 0.878258f, 0.599291f, 0.628277f, 0.188995f, 0.314402f, 0.9906f, 0.871704f,
-0.350917f, 0.748619f, 0.178313f, 0.275542f, 0.518647f, 0.550843f, 0.58982f, -0.474431f, 0.208758f,
-0.0588716f, -0.666091f, 0.590981f, 0.730171f, 0.746043f, 0.328829f, -0.175035f, 0.223961f, 0.193798f,
0.291203f, 0.077113f, -0.703316f, 0.158043f, -0.934073f, 0.401821f, 0.0363014f, 0.665218f, 0.0300982f,
-0.774704f, -0.02038f, 0.0206981f, -0.903001f, 0.628703f, -0.230683f, 0.275313f, -0.0957552f, -0.712036f,
-0.173844f, -0.505935f, -0.186467f, -0.965087f, 0.435194f, 0.147442f, 0.625894f, 0.165365f, -0.106515f,
-0.0452772f, 0.99033f, -0.882554f, -0.851479f, 0.281533f, 0.19456f, -0.554795f, -0.560424f, 0.260486f,
0.847025f, 0.475877f, -0.0742955f, -0.122876f, 0.701173f, 0.905324f, 0.897822f, 0.798172f, 0.534027f,
-0.332862f, 0.073485f, -0.561728f, -0.0448976f, 0.899641f, -0.0676628f, 0.768636f, 0.934554f, -0.632469f,
-0.083922f, 0.560448f, 0.532895f, 0.809563f, -0.484829f, 0.523225f, 0.927009f, -0.336308f, -0.195242f,
0.121569f, 0.108896f, 0.244333f, -0.617945f, -0.0440782f, -0.27979f, 0.30776f, 0.833045f, -0.578617f,
0.213084f, 0.730867f, -0.780445f, -0.252888f, -0.601995f, 0.29304f, 0.185384f, 0.353108f, 0.192681f,
-0.882279f, 0.121744f, 0.127235f, -0.514748f, -0.962178f, -0.312317f, -0.981853f, 0.847385f, 0.202853f,
0.541372f, 0.774394f, 0.866545f, -0.653871f, -0.104037f, -0.0245586f, 0.590463f, 0.278018f, 0.931363f });
set_values(weights, { 0.0f, 0.841471f, 0.909297f, 0.14112f, -0.756802f, -0.958924f, -0.279415f, 0.656987f,
0.989358f, 0.412118f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.912945f,
0.836656f, -0.00885131f, -0.84622f, -0.905578f, -0.132352f, 0.762558f, 0.956376f, 0.270906f,
-0.663634f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.745113f, -0.158623f,
-0.916522f, -0.831775f, 0.0177019f, 0.850904f, 0.901788f, 0.123573f, -0.768255f, -0.953753f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.304811f, -0.966118f, -0.739181f,
0.167356f, 0.920026f, 0.826829f, -0.0265512f, -0.85552f, -0.897928f, -0.114785f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.993889f, -0.629888f, 0.313229f, 0.968364f,
0.73319f, -0.176076f, -0.923458f, -0.821818f, 0.0353983f, 0.860069f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -0.506366f, 0.452026f, 0.994827f, 0.622989f, -0.321622f,
-0.970535f, -0.727143f, 0.184782f, 0.926818f, 0.816743f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.0f, 0.0f, 0.580611f, 0.998815f, 0.498713f, -0.459903f, -0.995687f, -0.61604f,
0.329991f, 0.97263f, 0.721038f, -0.193473f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
0.0f, 0.0f, 0.98024f, 0.363171f, -0.587795f, -0.998345f });
set_values(biases, { -0.491022f, 0.467745f, 0.996469f, 0.609044f });
std::vector<float> output_vec = {
-0.119782f, -1.19244f, -0.495079f, -0.760084f, 0.066627f, 0.42253f, 0.365703f, 0.62729f, 0.380708f,
0.0900432f, -0.866731f, -0.784469f, -2.18692f, -1.73267f, 0.251761f, -0.791547f, 0.634862f, 0.646589f,
-0.321454f, 0.575675f, 0.98983f, -0.445829f, 0.523965f, -0.346374f, 0.127803f, -2.13572f, -0.796409f,
1.5734f, -0.972705f, -1.88344f, -1.04588f, -0.0209212f, 0.78641f, -0.3878f, 0.151791f, 2.08673f, 0.698802f,
0.584678f, -0.78199f, -0.352576f, 1.03862f, 0.229792f, -0.223219f, 1.02365f, 1.45293f, 0.0561579f, 1.95182f,
1.59586f, 0.773778f, 0.648415f, 1.65464f, 1.311f, 0.326254f, -0.447391f, -0.858153f, -0.702836f, -0.589441f,
1.18929f, 0.382556f, 0.499048f, 1.16212f, 1.62688f, 1.31246f, 1.82684f };
topology topology(
input_layout("input", input.get_layout()),
input_layout("trans", trans.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
"trans",
{ "weights" },
{ "biases" },
1,                 // groups
2,                 // deformable groups
{ 1, 1, 1, 1 },    // stride
{ 0, 0, -2, -2 },  // input offset
{ 1, 1, 2, 2 },    // dilation
{ 1, 4, 4, 4 })    // output size
);
network network(engine, topology);
network.set_input_data("input", input);
network.set_input_data("trans", trans);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 4);
EXPECT_EQ(x_size, 4);
EXPECT_EQ(f_size, 4);
EXPECT_EQ(b_size, 1);
for (size_t i = 0; i < output_vec.size(); ++i) {
EXPECT_NEAR(output_vec[i], output_ptr[i], 0.1);
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_no_bias) {
// Filter : 2x3
// Stride : 2x1
// Input : 4x5
// Output : 2x3
//
// Input:
// 1 2 3 4 5
// 2 2 3 4 6
// 3 3 3 5 1
// 1 1 1 1 1
//
// Filter:
// 1 2 1
// 2 1 2
//
// Output:
// 20 27 38
// 17 19 19
// e.g. out[0][0] = 1*1 + 2*2 + 3*1 + 2*2 + 2*1 + 3*2 = 20 (no bias is added)
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32,format::yxfb,{ 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 3, 2 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 4.0f, 6.0f, 3.0f, 3.0f, 3.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f });
set_values(weights, { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f });
VVF<float> output_vec = {
{ 20.0f, 27.0f, 38.0f },
{ 17.0f, 19.0f, 19.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
convolution("conv", "input", { "weights" }, { 1,1,1,2 }));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_int8_no_bias) {
// Filter : 2x3
// Stride : 2x1
// Input : 4x5
// Output : 2x3
//
// Input (the float values below are rounded to int8 by the reorder):
// 1 2 4 5 6
// 3 2 4 4 7
// 4 4 3 5 1
// 2 1 1 1 2
//
// Filter:
// 1 2 1
// 2 1 2
//
// Output:
// 25 31 46
// 22 20 21
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32,format::bfyx,{ 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::i8,format::bfyx,{ 1, 1, 3, 2 } });
set_values(input, { 1.1f, 2.4f, 3.5f, 4.5f, 5.8f,
2.9f, 2.3f, 3.5f, 4.4f, 6.6f,
3.8f, 3.9f, 3.4f, 5.1f, 1.4f,
1.8f, 1.1f, 1.2f, 1.2f, 1.9f });
set_values<int8_t>(weights, { 1, 2, 1,
2, 1, 2 });
VVF<float> output_vec = {
{ 25.0f, 31.0f, 46.0f },
{ 22.0f, 20.0f, 21.0f } };
topology topology(
input_layout("input", input.get_layout()),
reorder("to_int","input", { data_types::i8,format::bfyx,{ 1, 1, 5, 4 } }),
data("weights", weights),
convolution("conv", "to_int", { "weights" }, { 1,1,1,2 }),
reorder("output", "conv", { data_types::f32,format::bfyx,{ 1, 1, 3, 2 } }));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "output");
auto output_memory = outputs.at("output").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution3D_no_bias) {
// Data is the same as in basic_convolution_no_bias.
// Filter : 2x3x1
// Stride : 2x1x1
// Input : 4x5x1
// Output : 2x3x1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 2 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 4.0f, 6.0f, 3.0f, 3.0f, 3.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f });
set_values(weights, { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f });
VVF<float> output_vec = {
{ 20.0f, 27.0f, 38.0f },
{ 17.0f, 19.0f, 19.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
convolution("conv", "input", { "weights" }, { 1,1,1,2 }));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int z_size = output_layout.size.spatial[2];
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(z_size, 1);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution3D) {
// Input : 4x4x4
// Filter : 2x2x2
// Output : 3x3x3
//
// Input:
// 1 0 1 0
// 1 1 3 1
// 1 1 0 2
// 0 2 1 1
//
// 1 0 0 1
// 2 0 1 2
// 3 1 1 1
// 0 0 3 1
//
// 2 0 1 1
// 3 3 1 0
// 2 1 1 0
// 3 2 1 2
//
// 1 0 2 0
// 1 0 3 3
// 3 1 0 0
// 1 1 0 2
//
// Filter:
// 0 1
// 0 0
//
// 2 1
// 0 0
// Output (before adding bias):
// 2 1 1
// 5 4 5
// 8 3 5
//
// 4 1 4
// 9 8 4
// 6 4 3
//
// 2 3 5
// 5 4 9
// 8 3 0
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfzyx,{ 1, 1, 4, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfzyx,{ 1, 1, 2, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1, 1 } });
set_values(input, {
1.0f, 0.0f, 1.0f, 0.0f,
1.0f, 1.0f, 3.0f, 1.0f,
1.0f, 1.0f, 0.0f, 2.0f,
0.0f, 2.0f, 1.0f, 1.0f,
1.0f, 0.0f, 0.0f, 1.0f,
2.0f, 0.0f, 1.0f, 2.0f,
3.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 3.0f, 1.0f,
2.0f, 0.0f, 1.0f, 1.0f,
3.0f, 3.0f, 1.0f, 0.0f,
2.0f, 1.0f, 1.0f, 0.0f,
3.0f, 2.0f, 1.0f, 2.0f,
1.0f, 0.0f, 2.0f, 0.0f,
1.0f, 0.0f, 3.0f, 3.0f,
3.0f, 1.0f, 0.0f, 0.0f,
1.0f, 1.0f, 0.0f, 2.0f,
});
set_values(weights, {
0.0f, 1.0f,
0.0f, 0.0f,
2.0f, 1.0f,
0.0f, 0.0f,
});
set_values(biases, { 1.0f });
VVVF<float> output_vec = {
{
{ 3.0f, 2.0f, 2.0f },
{ 6.0f, 5.0f, 6.0f },
{ 9.0f, 4.0f, 6.0f }
},
{
{ 5.0f, 2.0f, 5.0f },
{ 10.0f, 9.0f, 5.0f },
{ 7.0f, 5.0f, 4.0f }
},
{
{ 3.0f, 4.0f, 6.0f },
{ 6.0f, 5.0f, 10.0f },
{ 9.0f, 4.0f, 1.0f }
}
};
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int z_size = output_layout.size.spatial[2];
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfzyx);
EXPECT_EQ(z_size, 3);
EXPECT_EQ(y_size, 3);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int z = 0; z < z_size; ++z) {
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[z][y][x], output_ptr[z * y_size * x_size + y * x_size + x]);
}
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution3D_split2) {
// Data matches basic_convolution3D, replicated across two feature groups;
// the second group uses bias 2 instead of 1.
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfzyx,{ 1, 2, 4, 4, 4 } });
auto weights_1 = memory::allocate(engine, { data_types::f32, format::goizyx, tensor(cldnn::group(2), cldnn::batch(1), cldnn::feature(1), cldnn::spatial(2, 2, 2))});
auto biases_1 = memory::allocate(engine, { data_types::f32, format::bfyx, tensor(feature(2)) });
set_values(input, {
1.0f, 0.0f, 1.0f, 0.0f,
1.0f, 1.0f, 3.0f, 1.0f,
1.0f, 1.0f, 0.0f, 2.0f,
0.0f, 2.0f, 1.0f, 1.0f,
1.0f, 0.0f, 0.0f, 1.0f,
2.0f, 0.0f, 1.0f, 2.0f,
3.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 3.0f, 1.0f,
2.0f, 0.0f, 1.0f, 1.0f,
3.0f, 3.0f, 1.0f, 0.0f,
2.0f, 1.0f, 1.0f, 0.0f,
3.0f, 2.0f, 1.0f, 2.0f,
1.0f, 0.0f, 2.0f, 0.0f,
1.0f, 0.0f, 3.0f, 3.0f,
3.0f, 1.0f, 0.0f, 0.0f,
1.0f, 1.0f, 0.0f, 2.0f,
1.0f, 0.0f, 1.0f, 0.0f,
1.0f, 1.0f, 3.0f, 1.0f,
1.0f, 1.0f, 0.0f, 2.0f,
0.0f, 2.0f, 1.0f, 1.0f,
1.0f, 0.0f, 0.0f, 1.0f,
2.0f, 0.0f, 1.0f, 2.0f,
3.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 3.0f, 1.0f,
2.0f, 0.0f, 1.0f, 1.0f,
3.0f, 3.0f, 1.0f, 0.0f,
2.0f, 1.0f, 1.0f, 0.0f,
3.0f, 2.0f, 1.0f, 2.0f,
1.0f, 0.0f, 2.0f, 0.0f,
1.0f, 0.0f, 3.0f, 3.0f,
3.0f, 1.0f, 0.0f, 0.0f,
1.0f, 1.0f, 0.0f, 2.0f,
});
set_values(weights_1, {
0.0f, 1.0f,
0.0f, 0.0f,
2.0f, 1.0f,
0.0f, 0.0f,
0.0f, 1.0f,
0.0f, 0.0f,
2.0f, 1.0f,
0.0f, 0.0f,
});
set_values(biases_1, { 1.0f, 2.0f });
VVVVF<float> output_vec = {
{
{
{ 3.0f, 2.0f, 2.0f },
{ 6.0f, 5.0f, 6.0f },
{ 9.0f, 4.0f, 6.0f }
},
{
{ 5.0f, 2.0f, 5.0f },
{ 10.0f, 9.0f, 5.0f },
{ 7.0f, 5.0f, 4.0f }
},
{
{ 3.0f, 4.0f, 6.0f },
{ 6.0f, 5.0f, 10.0f},
{ 9.0f, 4.0f, 1.0f }
},
},
{
{
{ 4.0f, 3.0f, 3.0f },
{ 7.0f, 6.0f, 7.0f },
{ 10.0f, 5.0f, 7.0f }
},
{
{ 6.0f, 3.0f, 6.0f },
{ 11.0f, 10.0f, 6.0f },
{ 8.0f, 6.0f, 5.0f }
},
{
{ 4.0f, 5.0f, 7.0f },
{ 7.0f, 6.0f, 11.0f },
{ 10.0f, 5.0f, 2.0f }
},
}
};
topology topology(
input_layout("input", input.get_layout()),
data("weights_1", weights_1),
data("biases_1", biases_1),
convolution("conv", "input", { "weights_1" }, { "biases_1" }, 2, tensor(1), tensor(0), tensor(1), tensor{1, 2, 3, 3, 3}, data_types::f32));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int z_size = output_layout.size.spatial[2];
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfzyx);
EXPECT_EQ(z_size, 3);
EXPECT_EQ(y_size, 3);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(b_size, 1);
EXPECT_EQ(f_size, 2);
for (int f = 0; f < f_size; ++f) {
for (int z = 0; z < z_size; ++z) {
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[f][z][y][x],
output_ptr[f * z_size * y_size * x_size + z * y_size * x_size + y * x_size + x]);
}
}
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution3D_group2) {
// Data is the same as in basic_convolution3D_split2.
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfzyx,{ 1, 2, 4, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfzyx,{ 2, 1, 2, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1, 1 } });
set_values(input, {
1.0f, 0.0f, 1.0f, 0.0f,
1.0f, 1.0f, 3.0f, 1.0f,
1.0f, 1.0f, 0.0f, 2.0f,
0.0f, 2.0f, 1.0f, 1.0f,
1.0f, 0.0f, 0.0f, 1.0f,
2.0f, 0.0f, 1.0f, 2.0f,
3.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 3.0f, 1.0f,
2.0f, 0.0f, 1.0f, 1.0f,
3.0f, 3.0f, 1.0f, 0.0f,
2.0f, 1.0f, 1.0f, 0.0f,
3.0f, 2.0f, 1.0f, 2.0f,
1.0f, 0.0f, 2.0f, 0.0f,
1.0f, 0.0f, 3.0f, 3.0f,
3.0f, 1.0f, 0.0f, 0.0f,
1.0f, 1.0f, 0.0f, 2.0f,
1.0f, 0.0f, 1.0f, 0.0f,
1.0f, 1.0f, 3.0f, 1.0f,
1.0f, 1.0f, 0.0f, 2.0f,
0.0f, 2.0f, 1.0f, 1.0f,
1.0f, 0.0f, 0.0f, 1.0f,
2.0f, 0.0f, 1.0f, 2.0f,
3.0f, 1.0f, 1.0f, 1.0f,
0.0f, 0.0f, 3.0f, 1.0f,
2.0f, 0.0f, 1.0f, 1.0f,
3.0f, 3.0f, 1.0f, 0.0f,
2.0f, 1.0f, 1.0f, 0.0f,
3.0f, 2.0f, 1.0f, 2.0f,
1.0f, 0.0f, 2.0f, 0.0f,
1.0f, 0.0f, 3.0f, 3.0f,
3.0f, 1.0f, 0.0f, 0.0f,
1.0f, 1.0f, 0.0f, 2.0f,
});
set_values(weights, {
0.0f, 1.0f,
0.0f, 0.0f,
2.0f, 1.0f,
0.0f, 0.0f,
0.0f, 1.0f,
0.0f, 0.0f,
2.0f, 1.0f,
0.0f, 0.0f,
});
set_values(biases, { 1.0f, 2.0f });
VVVVF<float> output_vec = {
{
{
{ 3.0f, 2.0f, 2.0f },
{ 6.0f, 5.0f, 6.0f },
{ 9.0f, 4.0f, 6.0f }
},
{
{ 5.0f, 2.0f, 5.0f },
{ 10.0f, 9.0f, 5.0f },
{ 7.0f, 5.0f, 4.0f }
},
{
{ 3.0f, 4.0f, 6.0f },
{ 6.0f, 5.0f, 10.0f },
{ 9.0f, 4.0f, 1.0f }
},
},
{
{
{ 4.0f, 3.0f, 3.0f },
{ 7.0f, 6.0f, 7.0f },
{ 10.0f, 5.0f, 7.0f }
},
{
{ 6.0f, 3.0f, 6.0f },
{ 11.0f, 10.0f, 6.0f },
{ 8.0f, 6.0f, 5.0f }
},
{
{ 4.0f, 5.0f, 7.0f },
{ 7.0f, 6.0f, 11.0f },
{ 10.0f, 5.0f, 2.0f }
},
}
};
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int z_size = output_layout.size.spatial[2];
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfzyx);
EXPECT_EQ(b_size, 1);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(z_size, 3);
EXPECT_EQ(y_size, 3);
EXPECT_EQ(x_size, 3);
for (int f = 0; f < f_size; ++f) {
for (int z = 0; z < z_size; ++z) {
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[f][z][y][x],
output_ptr[f * z_size * y_size * x_size + z * y_size * x_size + y * x_size + x]);
}
}
}
}
}
TEST(convolution_f32_fw_gpu, with_output_size_same_input) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 320, 320 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 64, 4, 7, 7 } });
auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 64, 4, 7, 7 } });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("weights2", weights2),
convolution::create_with_output_size("conv1", "input", { "weights" }, {1, 64, 160, 160}, {1, 1, 2, 2}, {0, 0, -3, -3}),
convolution::create_with_output_size("conv2", "input", { "weights2" }, {1, 64, 320, 320}, {1, 1, 1, 1}, {0, 0, -3, -3})
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(2));
EXPECT_EQ(outputs.begin()->first, "conv1");
EXPECT_EQ(outputs.rbegin()->first, "conv2");
}
TEST(convolution_f32_fw_gpu, three_convolutions_same_weights) {
// Filter : 1x1
// Input : 2x2
// Output : 2x2
//
// Input:
// 1 1 1 1
// 1 1 1 1
//
// Filter:
// 1
//
// Output:
// 8 8 8 8
// 8 8 8 8
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, {1,2,2,2} });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 2,2,1,1 } });
set_values(input, { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f });
set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
convolution("conv1", "input", { "weights" }),
convolution("conv2", "conv1", { "weights" }),
convolution("conv3", "conv2", { "weights" })
);
cldnn::build_options options;
options.set_option(cldnn::build_option::optimize_data(true));
network network(engine, topology, options);
network.set_input_data("input", input);
auto outputs = network.execute();
auto output_memory = outputs.at("conv3").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 2);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; ++f) {
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_FLOAT_EQ(8.0f, output_ptr[f * y_size * x_size + y * x_size + x]);
}
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution) {
// Filter : 2x3
// Stride : 2x1
// Input : 4x5
// Output : 2x3
//
// Input:
// 1 2 3 4 5
// 2 2 3 4 6
// 3 3 3 5 1
// 1 1 1 1 1
//
// Filter:
// 1 2 1
// 2 1 2
//
// Output:
// 21 28 39
// 18 20 20
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 4.0f, 6.0f, 3.0f, 3.0f, 3.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f });
set_values(weights, { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 21.0f, 28.0f, 39.0f },
{ 18.0f, 20.0f, 20.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution( "conv", "input", { "weights" }, { "biases" }, { 0,0,1,2 }));
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_bfyx_weights_as_input_layout) {
// Same parameters as convolution_f32_fw_gpu.basic_convolution, but with bfyx data,
// optimize_data enabled, and weights/biases supplied at runtime via input_layout.
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 2.0f, 2.0f, 3.0f, 4.0f, 6.0f, 3.0f, 3.0f, 3.0f, 5.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f });
set_values(weights, { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 21.0f, 28.0f, 39.0f },
{ 18.0f, 20.0f, 20.0f } };
topology topology(
input_layout("input", input.get_layout()),
input_layout("weights", weights.get_layout()),
input_layout("biases", biases.get_layout()),
convolution("conv", "input", { "weights" }, { "biases" }, { 0,0,1,2 }));
cldnn::build_options options;
options.set_option(cldnn::build_option::optimize_data(true));
network network(engine, topology, options);
network.set_input_data("input", input);
network.set_input_data("weights", weights);
network.set_input_data("biases", biases);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_input_padding) {
// Filter : 2x2
// Stride : 1x1
// Input : 3x4
// Input padding : 2x1
// Output : 6x5
// Padding: Zero
//
// Input:
// z z z z z z
// z z z z z z
// z 1 2 3 4 z
// z 2 2 3 4 z
// z 3 3 3 5 z
// z z z z z z
// z z z z z z
//
// Filter:
// 1 1
// 1 1
//
// Output:
// 1 1 1 1 1
// 2 4 6 8 5
// 4 8 11 15 9
// 6 11 12 16 10
// 4 7 7 9 6
// 1 1 1 1 1
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 3 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 2.0f, 4.0f, 6.0f, 8.0f, 5.0f },
{ 4.0f, 8.0f, 11.0f, 15.0f, 9.0f },
{ 6.0f, 11.0f, 12.0f, 16.0f, 10.0f },
{ 4.0f, 7.0f, 7.0f, 9.0f, 6.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,1,1 },                // stride
{ 0,0,-1,-2 },              // input offset: x=-1, y=-2
{ 1, 1, 1, 1 },             // dilation
padding{ { 0,0,0,0 }, 0 })  // output padding
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 6);
EXPECT_EQ(x_size, 5);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_sym_input_padding) {
// Filter : 2x2
// Stride : 1x1
// Input : 3x4
// Input padding : above 2x1, below 2x1
// Output : 6x5
// Padding: Zero
//
// Input:
// z z z z z z
// z z z z z z
// z 1 2 3 4 z
// z 2 2 3 4 z
// z 3 3 3 5 z
// z z z z z z
// z z z z z z
//
// Filter:
// 1 1
// 1 1
//
// Output:
// 1 1 1 1 1
// 2 4 6 8 5
// 4 8 11 15 9
// 6 11 12 16 10
// 4 7 7 9 6
// 1 1 1 1 1
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 2.0f, 4.0f, 6.0f, 8.0f, 5.0f },
{ 4.0f, 8.0f, 11.0f, 15.0f, 9.0f },
{ 6.0f, 11.0f, 12.0f, 16.0f, 10.0f },
{ 4.0f, 7.0f, 7.0f, 9.0f, 6.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,1,1 },
{ 0,0,0,0 },
{ 1, 1, 1, 1 },
{ 0,0,1,2 },
{ 0,0,1,2 },
padding{ { 0,0,0,0 }, 0 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 6);
EXPECT_EQ(x_size, 5);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_asym_input_padding) {
// Filter : 2x2
// Stride : 1x1
// Input : 3x4
// Input padding : above 2x1, below 3x2
// Output : 7x6
// Padding: Zero
//
// Input:
// z z z z z z z
// z z z z z z z
// z 1 2 3 4 z z
// z 2 2 3 4 z z
// z 3 3 3 5 z z
// z z z z z z z
// z z z z z z z
// z z z z z z z
//
// Filter:
// 1 1
// 1 1
//
// Output:
// 1 1 1 1 1 1
// 2 4 6 8 5 1
// 4 8 11 15 9 1
// 6 11 12 16 10 1
// 4 7 7 9 6 1
// 1 1 1 1 1 1
// 1 1 1 1 1 1
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f },
{ 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f },
{ 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f },
{ 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,1,1 },
{ 0,0,0,0 },
{ 1, 1, 1, 1 },
{ 0,0,1,2 },
{ 0,0,2,3 },
padding{ { 0,0,0,0 }, 0 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 7);
EXPECT_EQ(x_size, 6);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_sym_input_padding_with_input_offset) {
// Filter : 2x2
// Stride : 1x1
// Input : 3x4
// Input padding : above 2x1, below 2x1
// Input offset: 2x1
// Output : 10x7
// Padding: Zero
//
// Input:
// z z z z z z z z
// z z z z z z z z
// z z z z z z z z
// z z z z z z z z
// z z 1 2 3 4 z z
// z z 2 2 3 4 z z
// z z 3 3 3 5 z z
// z z z z z z z z
// z z z z z z z z
// z z z z z z z z
// z z z z z z z z
//
// Filter:
// 1 1
// 1 1
//
// Output:
// 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1
// 1 2 4 6 8 5 1
// 1 4 8 11 15 9 1
// 1 6 11 12 16 10 1
// 1 4 7 7 9 6 1
// 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f },
{ 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f },
{ 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f },
{ 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,1,1 },
{ 0,0,-1,-2 },
{ 1, 1, 1, 1 },
{ 0,0,1,2 },
{ 0,0,1,2 },
padding{ { 0,0,0,0 }, 0 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 10);
EXPECT_EQ(x_size, 7);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_asym_input_padding_with_input_offset) {
// Filter : 2x2
// Stride : 1x1
// Input : 3x4
// Input padding : above 2x1, below 3x2
// Input offset: 2x1
// Output : 11x8
// Padding: Zero
//
// Input:
// z z z z z z z z z
// z z z z z z z z z
// z z z z z z z z z
// z z z z z z z z z
// z z 1 2 3 4 z z z
// z z 2 2 3 4 z z z
// z z 3 3 3 5 z z z
// z z z z z z z z z
// z z z z z z z z z
// z z z z z z z z z
// z z z z z z z z z
// z z z z z z z z z
//
// Filter:
// 1 1
// 1 1
//
// Output:
// 1 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1 1
// 1 2 4 6 8 5 1 1
// 1 4 8 11 15 9 1 1
// 1 6 11 12 16 10 1 1
// 1 4 7 7 9 6 1 1
// 1 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1 1
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 1, 4, 3 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f, 1.0f },
{ 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f, 1.0f },
{ 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f, 1.0f },
{ 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,1,1 },
{ 0,0,-1,-2 },
{ 1, 1, 1, 1 },
{ 0,0,1,2 },
{ 0,0,2,3 },
padding{ { 0,0,0,0 }, 0 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 11);
EXPECT_EQ(x_size, 8);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_convolution_input_and_output_padding) {
// Filter : 2x2
// Stride : 1x1
// Input : 3x4
    // Input padding : 2x1
    // Output padding : 1x2
    // Output : 8x9 (including output padding)
// Padding: Zero
//
// Input:
// z z z z z z
// z z z z z z
// z 1 2 3 4 z
// z 2 2 3 4 z
// z 3 3 3 5 z
// z z z z z z
// z z z z z z
//
// Filter:
// 1 1
// 1 1
//
    // Output (borders are output padding; only the 6x5 interior is checked):
// 1 1 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1 1 1
// 1 1 2 4 6 8 5 1 1
// 1 1 4 8 11 15 9 1 1
// 1 1 6 11 12 16 10 1 1
// 1 1 4 7 7 9 6 1 1
// 1 1 1 1 1 1 1 1 1
// 1 1 1 1 1 1 1 1 1
//
// Bias:
// 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 3 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { 1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f });
set_values(weights, { 1.0f, 1.0f, 1.0f, 1.0f });
set_values(biases, { 1.0f });
VVF<float> output_vec = {
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 2.0f, 4.0f, 6.0f, 8.0f, 5.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 4.0f, 8.0f, 11.0f, 15.0f, 9.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 6.0f, 11.0f, 12.0f, 16.0f, 10.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 4.0f, 7.0f, 7.0f, 9.0f, 6.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f },
{ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f } };
const int x_pad = 2;
const int y_pad = 1;
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,1,1 },
{ 0,0,-1,-2 },
{ 1, 1, 1, 1 },
padding{ { 0,0,-x_pad,-y_pad }, 0 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_memory = outputs.at("conv").get_memory();
auto output_layout = output_memory.get_layout();
auto output_size = output_layout.get_buffer_size();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_size.spatial[1];
int x_size = output_size.spatial[0];
int f_size = output_size.feature[0];
int b_size = output_size.batch[0];
EXPECT_EQ(output_layout.format, format::yxfb);
EXPECT_EQ(y_size, 8);
EXPECT_EQ(x_size, 9);
EXPECT_EQ(f_size, 1);
EXPECT_EQ(b_size, 1);
for (int y = y_pad; y < y_size - y_pad; ++y)
{
for (int x = x_pad; x < x_size - x_pad; ++x)
{
EXPECT_EQ(output_vec[y][x], output_ptr[y * x_size + x]);
}
}
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad_random) {
// Filter : 2x2
// Stride : 2x2
// Input : 4x4
// Output : 2x2
//
// Input:
// rnd rnd rnd rnd
// rnd rnd rnd rnd
// rnd rnd rnd rnd
// rnd rnd rnd rnd
//
// Filter
// rnd rnd
// rnd rnd
//
// Bias
// rnd
//
// Output:
// rnd rnd
// rnd rnd
size_t batch = 1, input_f = 1, input_y = 4, input_x = 4;
VVVVF<float> input_rnd = generate_random_4d<float>(batch, input_f, input_y, input_x, -10, 10);
VF<float> input_rnd_vec = flatten_4d<float>(format::yxfb, input_rnd);
VVVVF<float> filter_rnd = generate_random_4d<float>(1, 1, 2, 2, -10, 10);
VF<float> filter_rnd_vec = flatten_4d<float>(format::bfyx, filter_rnd);
VF<float> bias_rnd = generate_random_1d<float>(1, -10, 10);
VVVVF<float> output_rnd(batch, VVVF<float>(filter_rnd.size()));
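    // Build the expected values with the CPU reference convolution (stride 2x2) for every
    // batch and output feature, then compare them element-wise against the GPU output below.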
for (size_t b = 0; b < output_rnd.size(); ++b) {
for (size_t of = 0; of < filter_rnd.size(); ++of) {
output_rnd[b][of] = reference_convolve<float>(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]);
}
}
VF<float> output_rnd_vec = flatten_4d<float>(format::yxfb, output_rnd);
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, input_rnd_vec);
set_values(weights, filter_rnd_vec);
set_values(biases, bias_rnd);
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", {"weights"}, {"biases"}, {1,1,2,2})
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
for (size_t i = 0; i < output_rnd.size(); ++i) {
float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]);
EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl;
}
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad_random) {
// Filter : 2x2
// Stride : 2x2
// Input : 2x2x1x2
// Output : 1x1x1x2
//
// Input:
// rnd rnd rnd rnd
// rnd rnd rnd rnd
//
// Filter:
// rnd rnd
// rnd rnd
//
// Bias:
// rnd
//
// Output:
// rnd rnd
size_t batch = 2, input_f = 1, input_y = 2, input_x = 2;
VVVVF<float> input_rnd = generate_random_4d<float>(batch, input_f, input_y, input_x, -10, 10);
VF<float> input_rnd_vec = flatten_4d<float>(format::yxfb, input_rnd);
VVVVF<float> filter_rnd = generate_random_4d<float>(1, 1, 2, 2, -10, 10);
VF<float> filter_rnd_vec = flatten_4d<float>(format::bfyx, filter_rnd);
VF<float> bias_rnd = generate_random_1d<float>(1, -10, 10);
VVVVF<float> output_rnd(batch, VVVF<float>(filter_rnd.size()));
for (size_t b = 0; b < output_rnd.size(); ++b) {
for (size_t of = 0; of < filter_rnd.size(); ++of) {
output_rnd[b][of] = reference_convolve<float>(input_rnd[b], filter_rnd[of], 2, 2, bias_rnd[of]);
}
}
VF<float> output_rnd_vec = flatten_4d<float>(format::yxfb, output_rnd);
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, input_rnd_vec);
set_values(weights, filter_rnd_vec);
set_values(biases, bias_rnd);
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
for (size_t i = 0; i < output_rnd.size(); ++i) {
float x = float_round(output_rnd_vec[i]), y = float_round(output_ptr[i]);
EXPECT_FLOAT_EQ(x, y) << "random seed = " << random_seed << std::endl;
}
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x1x1_nopad) {
// Filter : 2x2
// Stride : 2x2
// Input : 4x4
// Output : 2x2
//
// Input:
// -0.5 1 0.5 2
// 1.5 -0.5 0 -1
// 0.5 0.5 -1 1
// 0.5 2 1.5 -0.5
//
// Filter
// -2 0.5
// 3.5 1.5
//
// Bias
// 2
//
// Output:
// 8 0.5
// 6 9
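    //
    // Hand-computed sketch for the top-left output (stride 2x2, window over rows 0-1, cols 0-1):
    //   (-0.5)*(-2.0) + 1.0*0.5 + 1.5*3.5 + (-0.5)*1.5 + 2.0 = 1.0 + 0.5 + 5.25 - 0.75 + 2.0 = 8.0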
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { -0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f });
set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f });
set_values(biases, { 2.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(8.0f, output_ptr[0]);
EXPECT_FLOAT_EQ(0.5f, output_ptr[1]);
EXPECT_FLOAT_EQ(6.0f, output_ptr[2]);
EXPECT_FLOAT_EQ(9.0f, output_ptr[3]);
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in2x2x1x2_nopad) {
// Filter : 2x2
// Stride : 2x2
// Input : 2x2x1x2
// Output : 1x1x1x2
//
// Input:
// 0.5 1.5 2.3 -0.4
// 2.0 -4.0 1.0 3.0
//
// Filter:
// -1.2 1.5
// 0.5 -0.5
//
// Bias:
// -1
//
// Output:
// 3.65 -5.36
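    //
    // Hand-computed sketch (both batches share the single filter and bias):
    //   b=0: 0.5*(-1.2) + 1.5*1.5 + 2.0*0.5 + (-4.0)*(-0.5) - 1.0 = 3.65
    //   b=1: 2.3*(-1.2) + (-0.4)*1.5 + 1.0*0.5 + 3.0*(-0.5) - 1.0 = -5.36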
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 1, 2, 2 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { 0.5f, 2.3f, 1.5f, -0.4f, 2.0f, 1.0f, -4.0f, 3.0f });
set_values(weights, { -1.2f, 1.5f, 0.5f, -0.5f });
set_values(biases, { -1.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 } )
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(3.65f, output_ptr[0]);
EXPECT_FLOAT_EQ(-5.36f, output_ptr[1]);
}
TEST(convolution_f32_fw_gpu, basic_ofm_wsiz2x1x2x1_in1x2x1_nopad) {
// Filter : 1x2x1x2x1
// Input : 1x1x2x1
// Output : 1x2x1x1
//
// Input:
// 1.0 2.0
//
// Filter:
// 1.0 2.0 ofm=0
// -1.0 -2.0 ofm=1
//
// Bias:
// 0.1 -0.2
//
// Output:
// 5.1 f=0
// -5.2 f=1
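    //
    // Hand-computed sketch:
    //   f=0: 1.0*1.0 + 2.0*2.0 + 0.1       = 5.1
    //   f=1: 1.0*(-1.0) + 2.0*(-2.0) - 0.2 = -5.2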
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 1, 2 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
set_values(input, { 1.0f, 2.0f });
set_values(weights, { 1.0f, 2.0f, -1.0f, -2.0f });
set_values(biases, { 0.1f, -0.2f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,5,5 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(5.1f, output_ptr[0]);
EXPECT_FLOAT_EQ(-5.2f, output_ptr[1]);
}
TEST(convolution_f32_fw_gpu, basic_ofm_wsiz3x2x2x1_in2x2x1_nopad) {
// Filter : 1x3x2x2x1
// Input : 1x2x2x1
// Output : 1x3x1x1
//
// Input:
// 1.0 2.0 f=0
// 3.0 4.0 f=1
//
// Filter:
// 1.0 2.0 ifm=0 ofm=0
// 3.0 4.0 ifm=1
//
// 5.0 6.0 ifm=0 ofm=1
// 7.0 8.0 ifm=1
//
// 9.0 10.0 ifm=0 ofm=2
// 11.0 12.0 ifm=1
// Bias:
// -5 -6 -7
//
// Output:
// 25.0 f=0
    // 64.0 f=1
// 103.0 f=2
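    //
    // Hand-computed sketch for f=0 (sum over both input features, then the bias):
    //   ifm0: 1.0*1.0 + 2.0*2.0 = 5, ifm1: 3.0*3.0 + 4.0*4.0 = 25 -> 30 - 5 = 25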
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 1, 2 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 3, 1, 1 } });
set_values(input, { 1.0f, 3.0f, 2.0f, 4.0f });
set_values(weights, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f });
set_values(biases, { -5.0f, -6.0f, -7.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,5,5 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(25.0f, output_ptr[0]);
EXPECT_FLOAT_EQ(64.0f, output_ptr[1]);
EXPECT_FLOAT_EQ(103.0f, output_ptr[2]);
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2x1x3_wstr2x2_in2x2x1x1_nopad) {
// Filter : 2x2x1x3
// Stride : 2x2
// Input : 2x2x1x1
// Output : 1x1x3x1
//
// Input:
// -2.3 -0.1
// 3.1 1.9
//
// Filter:
// -1.1 1.5 0.1 0.2 2.0 -1.0
// 0.5 -0.5 0.4 0.7 2.5 -1.5
//
// Bias:
// 0.1 -0.2 0.3
//
// Output:
    // 3.08 f=0
    // 2.12 f=1
    // 0.7  f=2
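    //
    // Hand-computed sketch for f=0 (first 2x2 filter dotted with the input, plus its bias):
    //   (-2.3)*(-1.1) + (-0.1)*1.5 + 3.1*0.5 + 1.9*(-0.5) + 0.1
    //     = 2.53 - 0.15 + 1.55 - 0.95 + 0.1 = 3.08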
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 3, 1, 1 } });
set_values(input, { -2.3f, -0.1f, 3.1f, 1.9f });
set_values(weights, { -1.1f, 1.5f, 0.5f, -0.5f, 0.1f, 0.2f, 0.4f, 0.7f, 2.0f, -1.0f, 2.5f, -1.5f });
set_values(biases, { 0.1f, -0.2f, 0.3f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_TRUE(are_equal(3.08f, output_ptr[0]));
EXPECT_TRUE(are_equal(2.12f, output_ptr[1]));
EXPECT_TRUE(are_equal(0.7f, output_ptr[2]));
}
TEST(convolution_f32_fw_gpu, wsiz3x3_wstr2x2_in2x2x1x1_zeropad) {
// Filter : 3x3
// Stride : 2x2
// Input : 2x2
// Output : 1x1
// Padding : zero
//
// Input:
// -0.5 1.0 padd
// 0.5 2.0 padd
// padd padd padd
//
// Filter
// -2 0.5 3.5
// 1.5 4 -5
// 0.5 1.5 -1.5
//
// Bias
// 2
//
// Output:
// 12.25
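    //
    // Hand-computed sketch (only the 2x2 input overlaps the 3x3 window; the rest is zero padding):
    //   (-0.5)*(-2.0) + 1.0*0.5 + 0.5*1.5 + 2.0*4.0 + 2.0 = 1.0 + 0.5 + 0.75 + 8.0 + 2.0 = 12.25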
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 3 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { -0.5f, 1.0f, 0.5f, 2.0f });
set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f, 4.0f, -5.0f, 0.5f, 1.5f, -1.5f });
set_values(biases, { 2.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 1,1,2,2 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(12.25f, output_ptr[0]);
}
TEST(convolution_f32_fw_gpu, offsets_wsiz3x3_wstr2x2_in2x2x1x1_zeropad) {
// Filter : 3x3
// Stride : 2x2
// Input : 2x2
// Input offset : -1x-1
    // Output : 1x1 (stored in a 3x3 buffer because of the 1x1 output padding)
// Output offset: 1x1
// Padding : zero
//
// Input:
// padd padd padd
// padd -0.5 1
// padd 0.5 2.0
//
// Filter
// -2 0.5 3.5
// 1.5 4 -5
// 0.5 1.5 -1.5
//
// Bias
// 2
//
    // Output:
    // rnd  rnd   rnd
    // rnd  -7.25 rnd
    // rnd  rnd   rnd
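    //
    // Hand-computed sketch for the single computed value (window over rows -1..1, cols -1..1;
    // only the bottom-right 2x2 of the filter overlaps real input):
    //   4.0*(-0.5) + (-5.0)*1.0 + 1.5*0.5 + (-1.5)*2.0 + 2.0 = -2.0 - 5.0 + 0.75 - 3.0 + 2.0 = -7.25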
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 2, 2 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 3, 3 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, { -0.5f, 1.0f, 0.5f, 2.0f });
set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f, 4.0f, -5.0f, 0.5f, 1.5f, -1.5f });
set_values(biases, { 2.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,2,2 },
{ 0,0,-1,-1 },
{ 1, 1, 1, 1 },
padding{ { 0,0,1,1 }, 0 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(-7.25f, output_ptr[4]);
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2) {
// Filter : 2x2
// Stride : 2x2
// Input : 4x4x2
// Output : 2x2x2
//
// Input:
// f0: -0.5 1 0.5 2
// 1.5 -0.5 0 -1
// 0.5 0.5 -1 1
// 0.5 2 1.5 -0.5
//
// f1: 0.5 1.5 2.3 -0.4
// 2.0 -4.0 1.0 3.0
// 0.5 1.5 2.3 -0.4
// 2.0 -4.0 1.0 3.0
//
// Filter1:
// -2 0.5
// 3.5 1.5
//
// Bias1:
// 2
//
// Filter2:
// -1.2 1.5
// 0.5 -0.5
//
// Bias2:
// -1
// Output:
// 8 3.65 0.5 -5.36
// 6 3.65 9 -5.36
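    //
    // In yxfb layout the two output features interleave per (y, x) position, so the buffer
    // reads f0 (filter1, bias 2): 8, 0.5, 6, 9 and f1 (filter2, bias -1): 3.65, -5.36, 3.65, -5.36.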
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 4, 4 } });
auto weights1 = memory::allocate(engine, { data_types::f32, format::goiyx, tensor(group(2), batch(1), feature(1), spatial(2,2))});
auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
set_values(input, {
-0.5f, 0.5f, 1.0f, 1.5f, 0.5f, 2.3f, 2.0f, -0.4f,
1.5f, 2.0f, -0.5f, -4.0f, 0.0f, 1.0f, -1.0f, 3.0f,
0.5f, 0.5f, 0.5f, 1.5f, -1.0f, 2.3f, 1.0f, -0.4f,
0.5f, 2.0f, 2.0f, -4.0f, 1.5f, 1.0f, -0.5f, 3.0f
});
set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f });
set_values(biases1, { 2.0f, -1.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights1", weights1),
data("biases1", biases1),
convolution(
"conv",
"input",
{ "weights1" },
{ "biases1" },
2,
{ 0,0,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(0.5f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 3));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 4));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 5));
EXPECT_FLOAT_EQ(9.0f, get_value<float>(output_ptr, 6));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 7));
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2) {
// 2x Filter : 2x2
// Stride : 2x2
// Input : 2x4x4x2
// Output : 2x2x2x2
//
// Input:
// f0b0: -0.5 1 0.5 2
// 1.5 -0.5 0 -1
// 0.5 0.5 -1 1
// 0.5 2 1.5 -0.5
//
// f0b1: -0.5 1 0.5 2
// 1.5 -0.5 0 -1
// 0.5 0.5 -1 1
// 0.5 2 1.5 -0.5
//
// f1b0: 0.5 1.5 2.3 -0.4
// 2.0 -4.0 1.0 3.0
// 0.5 1.5 2.3 -0.4
// 2.0 -4.0 1.0 3.0
//
// f1b1: 0.5 1.5 2.3 -0.4
// 2.0 -4.0 1.0 3.0
// 0.5 1.5 2.3 -0.4
// 2.0 -4.0 1.0 3.0
//
//
// Filter1:
// -2 0.5
// 3.5 1.5
//
// Bias1:
// 2
//
// Filter2:
// -1.2 1.5
// 0.5 -0.5
//
// Bias2:
// -1
// Output:
// 8 8 3.65 3.65 0.5 0.5 -5.36 -5.36
// 6 6 3.65 3.65 9 9 -5.36 -5.36
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 2, 2, 4, 4 } });
auto weights1 = memory::allocate(engine, { data_types::f32, format::goiyx, tensor(group(2), batch(1), feature(1), spatial(2,2)) });
auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
set_values(input, {
-0.5f, -0.5f, 0.5f, 0.5f, 1.0f, 1.0f, 1.5f, 1.5f, 0.5f, 0.5f, 2.3f, 2.3f, 2.0f, 2.0f, -0.4f, -0.4f,
1.5f, 1.5f, 2.0f, 2.0f, -0.5f, -0.5f, -4.0f, -4.0f, 0.0f, 0.0f, 1.0f, 1.0f, -1.0f, -1.0f, 3.0f, 3.0f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 1.5f, 1.5f, -1.0f, -1.0f, 2.3f, 2.3f, 1.0f, 1.0f, -0.4f, -0.4f,
0.5f, 0.5f, 2.0f, 2.0f, 2.0f, 2.0f, -4.0f, -4.0f, 1.5f, 1.5f, 1.0f, 1.0f, -0.5f, -0.5f, 3.0f, 3.0f,
});
set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f });
set_values(biases1, { 2.0f, -1.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights1", weights1),
data("biases1", biases1),
convolution(
"conv",
"input",
{ "weights1" },
{ "biases1" },
2,
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 3));
EXPECT_FLOAT_EQ(0.5f, get_value<float>(output_ptr, 4));
EXPECT_FLOAT_EQ(0.5f, get_value<float>(output_ptr, 5));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 6));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 7));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 8));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 9));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 10));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 11));
EXPECT_FLOAT_EQ(9.0f, get_value<float>(output_ptr, 12));
EXPECT_FLOAT_EQ(9.0f, get_value<float>(output_ptr, 13));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 14));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 15));
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_group2) {
    // data is similar to that in basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2
engine engine;
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::goiyx ,tensor(group(2), batch(1), feature(1), spatial(2,2)) });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
set_values(input, {
-0.5f, 0.5f, 1.0f, 1.5f, 0.5f, 2.3f, 2.0f, -0.4f,
1.5f, 2.0f, -0.5f, -4.0f, 0.0f, 1.0f, -1.0f, 3.0f,
0.5f, 0.5f, 0.5f, 1.5f, -1.0f, 2.3f, 1.0f, -0.4f,
0.5f, 2.0f, 2.0f, -4.0f, 1.5f, 1.0f, -0.5f, 3.0f
});
set_values(weights, {
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f
});
set_values(biases, { 2.0f, -1.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
2, // number of groups
{ 0,0,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(0.5f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 3));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 4));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 5));
EXPECT_FLOAT_EQ(9.0f, get_value<float>(output_ptr, 6));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 7));
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_group2_bfyx) {
    // data is similar to that in basic_wsiz2x2_wstr2x2_in4x4x2x1_nopad_split2
engine engine;
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 1, 2, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::goiyx ,tensor(group(2), batch(1), feature(1), spatial(2,2)) });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
set_values(input, {
-0.5f, 0.5f, 1.0f, 1.5f, 0.5f, 2.3f, 2.0f, -0.4f,
1.5f, 2.0f, -0.5f, -4.0f, 0.0f, 1.0f, -1.0f, 3.0f,
0.5f, 0.5f, 0.5f, 1.5f, -1.0f, 2.3f, 1.0f, -0.4f,
0.5f, 2.0f, 2.0f, -4.0f, 1.5f, 1.0f, -0.5f, 3.0f
});
set_values(weights, {
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f
});
set_values(biases, { 2.0f, -1.0f });
topology topology(
input_layout("input", input.get_layout()),
reorder("input_1", "input", { data_types::f32,format::bfyx,{ 1, 2, 4, 4 } }),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input_1",
{ "weights" },
{ "biases" },
2, // number of groups
{ 0,0,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(0.5f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(9.0f, get_value<float>(output_ptr, 3));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 4));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 5));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 6));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 7));
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_group2) {
    // data is similar to that in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2
engine engine;
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 2, 4, 4 } });
auto weights = memory::allocate(engine, { data_types::f32, format::goiyx ,tensor(group(2), batch(1), feature(1), spatial(2,2)) });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
set_values(input, {
-0.5f, -0.5f, 0.5f, 0.5f, 1.0f, 1.0f, 1.5f, 1.5f, 0.5f, 0.5f, 2.3f, 2.3f, 2.0f, 2.0f, -0.4f, -0.4f,
1.5f, 1.5f, 2.0f, 2.0f, -0.5f, -0.5f, -4.0f, -4.0f, 0.0f, 0.0f, 1.0f, 1.0f, -1.0f, -1.0f, 3.0f, 3.0f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 1.5f, 1.5f, -1.0f, -1.0f, 2.3f, 2.3f, 1.0f, 1.0f, -0.4f, -0.4f,
0.5f, 0.5f, 2.0f, 2.0f, 2.0f, 2.0f, -4.0f, -4.0f, 1.5f, 1.5f, 1.0f, 1.0f, -0.5f, -0.5f, 3.0f, 3.0f,
});
set_values(weights, {
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f
});
set_values(biases, { 2.0f, -1.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
2, // number of groups
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 3));
EXPECT_FLOAT_EQ(0.5f, get_value<float>(output_ptr, 4));
EXPECT_FLOAT_EQ(0.5f, get_value<float>(output_ptr, 5));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 6));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 7));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 8));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 9));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 10));
EXPECT_FLOAT_EQ(3.65f, get_value<float>(output_ptr, 11));
EXPECT_FLOAT_EQ(9.0f, get_value<float>(output_ptr, 12));
EXPECT_FLOAT_EQ(9.0f, get_value<float>(output_ptr, 13));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 14));
EXPECT_FLOAT_EQ(-5.36f, get_value<float>(output_ptr, 15));
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt) {
    // Test for depthwise separable optimization; there are 16 weights and biases (split 16)
    // data is similar to that in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 but with batch 1
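    // With 16 groups over 16 input features and a single 2x2 filter per group, each output
    // feature depends only on the matching input feature, i.e. a depthwise convolution.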
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 16, 4, 4 } });
set_values(input, {
-0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f,
1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f,
0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f,
2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f,
1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f,
-0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f,
0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f,
-1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f,
-1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f,
1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f,
0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f,
2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f,
1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f,
-0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f,
});
auto weights1 = memory::allocate(engine, { data_types::f32, format::goiyx ,tensor(group(16), batch(1), feature(1), spatial(2,2)) });
auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 1, 1 } });
set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f });
set_values(biases1, { 2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f });
primitive_id weights_id = "weights1";
primitive_id bias_id = "biases1";
topology topology(
input_layout("input", input.get_layout()),
data(weights_id, weights1),
data(bias_id, biases1),
convolution(
"conv",
"input",
{ weights_id },
{ bias_id },
16, // number of groups
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
std::vector<float> expected_output_vec = {
8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f,
0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f,
6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f,
9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f,
};
for (unsigned int i = 0; i < expected_output_vec.size(); i++)
{
EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]);
}
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt_bfyx) {
    // Test for depthwise separable optimization; there are 16 weights and biases (split 16)
    // data is similar to that in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2 but with batch 1
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 16, 4, 4 } });
set_values(input, {
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
});
auto weights1 = memory::allocate(engine, { data_types::f32, format::goiyx ,tensor(group(16), batch(1), feature(1), spatial(2,2)) });
auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 1, 1 } });
set_values(weights1, { -2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f, -1.2f, 1.5f, 0.5f, -0.5f });
set_values(biases1, { 2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f,
2.0f, -1.0f });
primitive_id weights_id = "weights1";
primitive_id bias_id = "biases1";
topology topology(
input_layout("input", input.get_layout()),
data(weights_id, weights1),
data(bias_id, biases1),
convolution(
"conv",
"input",
{ weights_id },
{ bias_id },
16, // number of groups
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
std::vector<float> expected_output_vec = {
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
};
for (unsigned int i = 0; i < expected_output_vec.size(); i++)
{
EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]);
}
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_group16) {
    // Test for grouped convolution; there are 16 joined weights and biases (group 16)
    // data is similar to that in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt
engine engine;
auto input = memory::allocate(engine, { data_types::f32, format::yxfb,{ 2, 16, 4, 4 } });
set_values(input, {
-0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f, -0.5f, -0.5f, 0.5f, 0.5f,
1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f,
0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f, 0.5f, 0.5f, 2.3f, 2.3f,
2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f, 2.0f, 2.0f, -0.4f, -0.4f,
1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f, 1.5f, 1.5f, 2.0f, 2.0f,
-0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f, -0.5f, -0.5f, -4.0f, -4.0f,
0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f,
-1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f, -1.0f, -1.0f, 3.0f, 3.0f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f,
-1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f, -1.0f, -1.0f, 2.3f, 2.3f,
1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f, 1.0f, 1.0f, -0.4f, -0.4f,
0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f, 0.5f, 0.5f, 2.0f, 2.0f,
2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f, 2.0f, 2.0f, -4.0f, -4.0f,
1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f, 1.5f, 1.5f, 1.0f, 1.0f,
-0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f, -0.5f, -0.5f, 3.0f, 3.0f,
});
topology topology(input_layout("input", input.get_layout()));
auto weights = memory::allocate(engine, { data_types::f32, format::goiyx ,tensor(group(16), batch(1), feature(1), spatial(2,2)) });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 1, 1 } });
set_values(weights,
{
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f
}
);
set_values(biases, { 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f});
topology.add(
data("weights", weights),
data("bias", biases)
);
topology.add(
convolution(
"conv",
"input",
{ "weights" },
{ "bias" },
16,
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
std::vector<float> expected_output_vec = {
8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f, 8.0f, 8.0f, 3.65f, 3.65f,
0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f, 0.5f, 0.5f, -5.36f, -5.36f,
6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f, 6.0f, 6.0f, 3.65f, 3.65f,
9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f, 9.0f, 9.0f, -5.36f, -5.36f,
};
for (unsigned int i = 0; i < expected_output_vec.size(); i++)
{
EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]);
}
}
TEST(convolution_f32_fw_gpu, basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_group16_bfyx) {
    // Test for grouped convolution; there are 16 joined weights and biases (group 16)
    // data is similar to that in basic_wsiz2x2_wstr2x2_in4x4x2x2_nopad_split2_depthwise_sep_opt_bfyx
engine engine;
auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 2, 16, 4, 4 } });
set_values(input, {
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
-0.5f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f, 0.0f, -1.0f, 0.5f, 0.5f, -1.0f, 1.0f, 0.5f, 2.0f, 1.5f, -0.5f,
0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f, 0.5f, 1.5f, 2.3f, -0.4f, 2.0f, -4.0f, 1.0f, 3.0f,
});
topology topology(input_layout("input", input.get_layout()));
auto weights = memory::allocate(engine, { data_types::f32, format::goiyx ,tensor(group(16), batch(1), feature(1), spatial(2,2)) });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 16, 1, 1 } });
set_values(weights,
{
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f,
-2.0f, 0.5f, 3.5f, 1.5f,
-1.2f, 1.5f, 0.5f, -0.5f
}
);
set_values(biases, { 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f, 2.0f, -1.0f});
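    // With 16 groups and one input feature per group, each output feature is a depthwise
    // 2x2, stride-2 convolution of the matching input feature. Filters and biases alternate
    // between two patterns, so the output features alternate between two result sets.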
topology.add(
data("weights", weights),
data("bias", biases)
);
topology.add(
convolution(
"conv",
"input",
{ "weights" },
{ "bias" },
            16,              // groups
            { 1,1,2,2 },     // stride
            { 0,0,0,0 },     // input offset
            { 1,1,1,1 })     // dilation
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
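    // Output layout is bfyx { 2, 16, 2, 2 }. Each row below lists the 2x2 outputs of two
    // consecutive features (even features: 8, 0.5, 6, 9; odd features: 3.65, -5.36, 3.65, -5.36);
    // only the batch-0 half of the output is checked.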
std::vector<float> expected_output_vec = {
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
8.0f, 0.5f, 6.0f, 9.0f, 3.65f,-5.36f, 3.65f, -5.36f,
};
for (unsigned int i = 0; i < expected_output_vec.size(); i++)
{
EXPECT_FLOAT_EQ(expected_output_vec[i], output_ptr[i]);
}
}
/*TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_nopad_split2) {
// Filter : 1x1
// Stride : 2x2
// Input : 1x1x4
// Output : 1x1x4
//
// Input:
// f0: 1.5
// f1: 0.5
//
// f2: 0.0
// f3: -0.5
//
//
// Filter1:
// -2 -0.5 ofm=0
// 1 2 ofm=1
// Bias1:
// 1 5
//
// Filter2:
// 4 1.5 ofm=0
// 2 0.5 ofm=1
//
// Bias2:
// -1 2.5
//
// Output:
// -2.25
// 7.5
//
// -1.75
// 2.25
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 4, 1, 1 } });
//auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 4 } });
auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } });
auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 1, 1 } });
auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
set_values(input, {
1.5f, 0.5f, 0.0f, -0.5f
});
set_values(weights1, { -2.0f, -0.5f, 1.0f, 2.0f });
set_values(biases1, { 1.0f, 5.0f });
set_values(weights2, { 4.0f, 1.5f, 2.0f, 0.5f });
set_values(biases2, { -1.0f, 2.5f });
topology topology(
input_layout("input", input.get_layout()),
data("weights1", weights1),
data("biases1", biases1),
data("weights2", weights2),
data("biases2", biases2),
convolution(
"conv",
"input",
{ "weights1", "weights2" },
{ "biases1", "biases2" },
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(-2.25f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(7.5f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(-1.75f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(2.25f, get_value<float>(output_ptr, 3));
}
TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x2x1_nopad_split2) {
// Filter : 1x1
// Stride : 2x2
// Input : 1x1x2
// Output : 1x1x4
//
// Input:
// f0: 1.5
//
// f1: 0.5
//
// Filter1:
// -2 ofm=0
// 1 ofm=1
// Bias1:
// 1 5
//
// Filter2:
// 4 ofm=0
// 2 ofm=1
//
// Bias2:
// -1 2.5
//
// Output:
// -2
// 6.5
//
// 1
// 3.5
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 2, 1, 1 } });
//auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 4 } });
auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } });
auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 1, 1, 1 } });
auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
set_values(input, {
1.5f, 0.5f
});
set_values(weights1, { -2.0f, 1.0f });
set_values(biases1, { 1.0f, 5.0f });
set_values(weights2, { 4.0f, 2.0f });
set_values(biases2, { -1.0f, 2.5f });
topology topology(
input_layout("input", input.get_layout()),
data("weights1", weights1),
data("biases1", biases1),
data("weights2", weights2),
data("biases2", biases2),
convolution(
"conv",
"input",
{ "weights1", "weights2" },
{ "biases1", "biases2" },
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(-2.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(6.5f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(1.0f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(3.5f, get_value<float>(output_ptr, 3));
}
TEST(convolution_f32_fw_gpu, basic_wsiz1x1_wstr2x2_in1x1x4x1_filter_1x3x2x1x1_nopad_split2) {
// Filter : 1x1
// Stride : 2x2
// Input : 1x1x4
// Output : 1x1x6
//
// Input:
// f0: 1.5
// f1: 0.5
//
// f2: 2
// f3: -1.0
//
// Filter1:
// -2 1 ofm=0
// 1 3 ofm=1
// 0.5 8 ofm=2
// Bias1:
// 1 5 3
//
// Filter2:
// 4 -4 ofm=0
// 2 0.5 ofm=1
// -0.5 3 ofm=2
//
// Bias2:
// -1 2.5 2
//
// Output:
// -1.5
// 8
// 7.75
//
// 11
// 6
// -2
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 4, 1, 1 } });
//auto output = memory::allocate({ memory::format::yxfb_f32,{ 1,{ 1, 1 }, 6 } });
auto weights1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } });
auto biases1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 3, 1, 1 } });
auto weights2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 3, 2, 1, 1 } });
auto biases2 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 3, 1, 1 } });
set_values(input, {
1.5f, 0.5f, 2.0f, -1.0f
});
set_values(weights1, { -2.0f, 1.0f, 1.0f, 3.0f, 0.5f, 8.0f });
set_values(biases1, { 1.0f, 5.0f, 3.0f });
set_values(weights2, { 4.0f, -4.0f, 2.0f, 0.5f, -0.5f, 3.0f });
set_values(biases2, { -1.0f, 2.5f, 2.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights1", weights1),
data("biases1", biases1),
data("weights2", weights2),
data("biases2", biases2),
convolution(
"conv",
"input",
{ "weights1", "weights2" },
{ "biases1", "biases2" },
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1,1,1,1 })
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "conv");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(-1.5f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(8.0f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(7.75f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(11.0f, get_value<float>(output_ptr, 3));
EXPECT_FLOAT_EQ(6.0f, get_value<float>(output_ptr, 4));
EXPECT_FLOAT_EQ(-2.0f, get_value<float>(output_ptr, 5));
}*/
TEST(convolution_gpu, trivial_convolution_relu) {
// Filter : 2x2
// Stride : 2x2
// Input : 4x4
// Output : 2x2
// Input:
// -0.5 1 0.5 2
// 1.5 -0.5 0 -1
// 0.5 0.5 -1 1
// 0.5 2 1.5 -0.5
//
// Filter
// -2 0.5
// 3.5 1.5
//
// Bias
// -2
//
    //  Output:
    //  4  0
    //  2  5
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
//auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, {
-0.5f, 1.0f, 0.5f, 2.0f,
1.5f, -0.5f, 0.0f, -1.0f,
0.5f, 0.5f, -1.0f, 1.0f,
0.5f, 2.0f, 1.5f, -0.5f
});
set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f });
set_values(biases, { -2.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1, 1, 1, 1 }),
activation(
"out",
"conv",
activation_func::relu
)
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "out");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(4.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(0.0f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(2.0f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(5.0f, get_value<float>(output_ptr, 3));
}
TEST(convolution_gpu, relu_with_negative_slope) {
// Filter : 2x2
// Stride : 2x2
// Input : 4x4
// Output : 2x2
// Negative Slope : 0.1
// Input:
// -0.5 1 0.5 2
// 1.5 -0.5 0 -1
// 0.5 0.5 -1 1
// 0.5 2 1.5 -0.5
//
// Filter
// -2 0.5
// 3.5 1.5
//
// Bias
// -2
//
// Output:
// 4 -0.35
// 2 5
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::yxfb, { 1, 1, 4, 4 } });
//auto output = memory::allocate({ memory::format::yxfb_f32,{ 1 ,{ 2, 2 }, 1 } });
auto weights = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 2, 2 } });
auto biases = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
set_values(input, {
-0.5f, 1.0f, 0.5f, 2.0f,
1.5f, -0.5f, 0.0f, -1.0f,
0.5f, 0.5f, -1.0f, 1.0f,
0.5f, 2.0f, 1.5f, -0.5f
});
set_values(weights, { -2.0f, 0.5f, 3.5f, 1.5f });
set_values(biases, { -2.0f });
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,2,2 },
{ 0,0,0,0 },
{ 1, 1, 1, 1 }),
activation(
"out",
"conv",
activation_func::relu_negative_slope,
{0.1f, 0.0f}
)
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "out");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
EXPECT_FLOAT_EQ(4.0f, get_value<float>(output_ptr, 0));
EXPECT_FLOAT_EQ(-0.35f, get_value<float>(output_ptr, 1));
EXPECT_FLOAT_EQ(2.0f, get_value<float>(output_ptr, 2));
EXPECT_FLOAT_EQ(5.0f, get_value<float>(output_ptr, 3));
}
TEST(convolution_gpu, DISABLED_two_1x1_kernels_after_each_other) {
const auto& engine = get_test_engine();
extern const std::vector<float> conv_1x1_output;
auto input = memory::allocate(engine, { data_types::f32, format::bfyx,{ 16, 8, 16, 16 } });
auto weights_conv_1 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 8, 8, 1, 1 } });
auto weights_conv_2 = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 8, 1, 1 } });
set_random_values<float>(input);
set_random_values<float>(weights_conv_1);
set_random_values<float>(weights_conv_2);
auto inp_lay = input_layout("input", input.get_layout());
auto conv_1 = convolution(
"conv_1",
"input",
{ "weights_conv_1" });
auto conv_2 = convolution(
"conv_2",
"conv_1",
{ "weights_conv_2" });
topology topology(
inp_lay,
data("weights_conv_1", weights_conv_1),
conv_1,
data("weights_conv_2", weights_conv_2),
conv_2
);
build_options bo;
bo.set_option(build_option::optimize_data(true));
network network(engine, topology, bo);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
auto output_prim = outputs.at("conv_2").get_memory();
auto output_ptr = output_prim.pointer<float>();
auto output_layout = output_prim.get_layout();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
int f_offset = y_size * x_size;
int b_offset = f_size * f_offset;
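    // Flat bfyx offsets: one feature plane spans y_size * x_size elements and one batch
    // spans f_size such planes.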
for (int b = 0; b < b_size; ++b)
{
for (int f = 0; f < f_size; ++f)
{
for (int y = 0; y < y_size; ++y)
{
for (int x = 0; x < x_size; ++x)
{
int idx = b * b_offset + f * f_offset + y * x_size + x;
EXPECT_TRUE(are_equal(conv_1x1_output[idx], get_value<float>(output_ptr, idx)));
}
}
}
}
}
TEST(convolution_gpu, basic_yxfb_4_4_yxfb_2_2_b16_if2_of16_st2_2_p0_sp1_fp32)
{
#define USE_OLD_WEIGHTS_FORMAT 0 // 0 - yxfb weights layout, 1 - old bfyx weights layout
const auto input_format = format::yxfb;
#if USE_OLD_WEIGHTS_FORMAT
const auto weights_format = format::bfyx;
#else
const auto weights_format = format::yxfb;
#endif
const auto biases_format = format::bfyx;
const int32_t batch_size = 16;
const int32_t input_feature_count = 2;
const int32_t output_feature_count = 16;
const int32_t stride_x = 2;
const int32_t stride_y = 2;
const int32_t input_x = 4;
const int32_t input_y = 4;
const int32_t weights_x = 2;
const int32_t weights_y = 2;
const int32_t output_x = (input_x - weights_x) / stride_x + 1;
const int32_t output_y = (input_y - weights_y) / stride_y + 1;
const auto& engine = get_test_engine();
auto input_size = tensor( batch_size, input_feature_count, input_x, input_y );
auto input = memory::allocate(engine, { data_types::f32, input_format, input_size });
auto weights_size = tensor( output_feature_count, input_feature_count, weights_x, weights_y );
auto weights = memory::allocate(engine, { data_types::f32, weights_format, weights_size });
auto biases = memory::allocate(engine, { data_types::f32, biases_format, {1,output_feature_count,1,1}});
//auto output = memory::allocate({output_format, {batch_size, {output_x, output_y}, output_feature_count}});
// input:
std::vector<float> input_vals_template {
0.25f, 0.50f, 0.75f, 1.00f,
1.25f, 1.50f, 1.75f, 2.00f,
2.25f, 2.50f, 2.75f, 3.00f,
3.25f, 3.50f, 3.75f, 4.00f,
};
input_vals_template.resize(input_y * input_x);
std::vector<float> input_vals;
input_vals.reserve(input_y * input_x * input_feature_count * batch_size);
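    // Fill in yxfb order: batch varies fastest, then input feature, then x/y position.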
for (uint32_t yxi = 0; yxi < input_y * input_x; ++yxi)
{
for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
{
for (uint32_t bi = 0; bi < batch_size; ++bi)
{
input_vals.push_back((bi * input_feature_count + ifi + 1) * input_vals_template[yxi]);
}
}
}
set_values(input, input_vals);
// weights:
std::vector<float> weights_vals_template {
-4.0f, -2.0f,
4.0f, 4.0f,
};
weights_vals_template.resize(weights_y * weights_x);
std::vector<float> weights_vals;
weights_vals.reserve(weights_y * weights_x * input_feature_count * output_feature_count);
#if USE_OLD_WEIGHTS_FORMAT
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
{
for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
{
weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
}
}
}
#else
for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
{
for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
{
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
}
}
}
#endif
set_values(weights, weights_vals);
// biases:
std::vector<float> biases_vals;
biases_vals.reserve(output_feature_count);
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
biases_vals.push_back(ofi * 1.0f);
}
set_values(biases, biases_vals);
// output:
std::vector<float> output_vals_template {
9.0f, 10.0f,
13.0f, 14.0f,
};
output_vals_template.resize(output_y * output_x);
std::vector<float> output_vals;
output_vals.reserve(output_y * output_x * output_feature_count * batch_size);
for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
{
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
for (uint32_t bi = 0; bi < batch_size; ++bi)
{
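                // template_factor is the sum over input features of
                // (bi * F + ifi + 1) * (ofi * F + ifi + 1), i.e. the per-feature scales
                // applied to the input and weight templates above (F = input_feature_count).
                // Expanded: F^3*bi*ofi + F^2*(F+1)/2 * (bi+ofi) + F*(F+1)*(2F+1)/6.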
uint32_t template_factor = input_feature_count * input_feature_count * input_feature_count * bi * ofi +
input_feature_count * input_feature_count * (input_feature_count + 1) / 2 * (bi + ofi) +
input_feature_count * (input_feature_count + 1) * (2 * input_feature_count + 1) / 6;
float bias_factor = ofi * 1.0f;
output_vals.push_back(template_factor * output_vals_template[yxi] + bias_factor);
}
}
}
// Computing convolution.
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution(
"conv",
"input",
{ "weights" },
{ "biases" },
{ 1,1,stride_x,stride_y },
{ 0,0,0,0 },
{ 1, 1, 1, 1 }),
activation(
"out",
"conv",
activation_func::relu,
{ 0.1f, 0.0f }
)
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "out");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
// Checking result.
uint32_t i = 0;
for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
{
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
for (uint32_t bi = 0; bi < batch_size; ++bi, ++i)
{
auto equal = are_equal(output_vals[i], get_value<float>(output_ptr, i));
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Failed at position (" << yxi << ", output feature = " << ofi << ", batch = " << bi << "): "
<< output_vals[i] << " != " << get_value<float>(output_ptr, i) << std::endl;
return;
}
}
}
}
#undef USE_OLD_WEIGHTS_FORMAT
}
void add_primitives(const engine& engine, topology& topology)
{
auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 2 } });
std::vector<char> weights_values = { 1, 2, 1,
2, 1, 2,
19, 17, -1,
-10, 32, 23 };
set_values<char>(weights, weights_values);
cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
set_values(biases, { 1.0f, -8.0f });
topology.add(
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, { 0, 0, 1, 2 }, { 0, 0, 0, 0 }, { 1, 1, 1, 1 }),
activation( "out", "conv", activation_func::relu)
);
}
TEST(convolution_f32_fw_gpu, byte_activation) {
// Filter : 2x3
// Stride : 2x1
// Input : 4x5
// Output : 2x3
//
    // Input:
    //  1  2 -3  4 -5
    //  2 -2  3 -4  6
    // -3  3 -3  5 -1
    // -1 -1 -1 -1 -1
    //
    // Filter:
    //  1  2  1
    //  2  1  2
    //
    //  19  17  -1
    // -10  32  23
    //
    // Bias:
    // 1 -8
    //
    // Output (after ReLU):
    // 11  0 15
    //  0  0  2
    //
    // 33  0  0
    //  0  0  0
auto eng_conf = get_test_engine();
engine engine{ eng_conf };
auto input = memory::allocate(engine, { data_types::i8, format::bfyx,{ 1, 1, 5, 4 } });
VVVF<char> output_vec = {
{
{ 11, 0, 15 },
{ 0, 0, 2 }
},
{
{ 33, 0, 0 },
{ 0, 0, 0 }
} };
build_options opts;
opts.set_option(build_option::optimize_data(true));
opts.set_option(build_option::graph_dumps_dir("graph"));
set_values<char>(input, { 1, 2, -3, 4, -5,
2, -2, 3, -4, 6,
-3, 3, -3, 5, -1,
-1, -1, -1, -1, -1 });
topology topology(
input_layout("input", input.get_layout()));
add_primitives(engine, topology);
network network(engine, topology, opts);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; f++)
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 3.0f);
}
}
}
TEST(convolution_int8_fw_gpu, quantized_convolution_u8s8f32_symmetric) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 3 } });
cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
set_values<uint8_t>(input, { 1, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1 });
set_values<int8_t>(weights, { 1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5 });
set_values(biases, { 1.0f, -8.0f });
VVVF<float> output_vec = {
{
{ 52.0f, 78.0f, 3.0f },
{ 8.0f, 14.0f, 0.0f }
},
{
{ 20.0f, 35.0f, 31.0f },
{ 11.0f, 19.0f, 0.0f }
} };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
convolution("conv", "input", { "weights" }, { "biases" }, tensor{ 0, 0, 2, 2 }, tensor(0), tensor{1, 1, 1, 1}, tensor{1, 2, 3, 2}),
reorder("out", "conv", format::bfyx, data_types::f32));
build_options opts;
opts.set_option(build_option::optimize_data(true));
network network(engine, topology, opts);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_ptr = output_memory.pointer<float>();
auto output_layout = output_memory.get_layout();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; f++)
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 1e-5f) <<
" x="<<x << " y=" << y << " f=" << f;
}
}
}
TEST(convolution_int8_fw_gpu, quantized_convolution_u8s8f32_asymmetric_weight_and_activations) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 3 } });
cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
auto w_zp = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 1, 1 } });
auto a_zp = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 1, 1, 1 } });
set_values<uint8_t>(input, { 1, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1 });
set_values<int8_t>(weights, { 1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5 });
set_values<uint8_t>(a_zp, { 2 });
set_values<int8_t>(w_zp, { 1, -1 });
set_values(biases, { 1.0f, -8.0f });
VVVF<float> output_vec = {
{
{ 12.0f, 26.0f, -19.0f },
{ 2.0f, 8.0f, 4.0f }
},
{
{ -8.0f, 19.0f, 21.0f },
{ -7.0f, 1.0f, -18.0f }
} };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
data("a_zp", a_zp),
data("w_zp", w_zp),
convolution("conv", "input", { "weights" }, { "biases" }, { "w_zp" }, { "a_zp" }, 1, data_types::f32,
tensor{ 0, 0, 2, 2 }, tensor(0), tensor{1, 1, 1, 1}, tensor{1, 2, 3, 2}),
reorder("out", "conv", format::bfyx, data_types::f32));
build_options opts;
opts.set_option(build_option::optimize_data(true));
network network(engine, topology, opts);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_ptr = output_memory.pointer<float>();
auto output_layout = output_memory.get_layout();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; f++)
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 1e-5f) <<
" x="<< x << " y=" << y << " f=" << f;
}
}
}
TEST(convolution_int8_fw_gpu, quantized_convolution_u8s8f32_asymmetric_activations_per_tensor) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 3 } });
cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
auto a_zp = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 1, 1, 1 } });
set_values<uint8_t>(input, { 1, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1 });
set_values<int8_t>(weights, { 1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5 });
set_values<uint8_t>(a_zp, { 2 });
set_values(biases, { 1.0f, -8.0f });
VVVF<float> output_vec = {
{
{ 16.0f, 42.0f, -13.0f },
{ 2.0f, 8.0f, 2.0f }
},
{
{ -12.0f, 3.0f, 15.0f },
{ -7.0f, 1.0f, -16.0f }
} };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
data("a_zp", a_zp),
convolution("conv", "input", { "weights" }, { "biases" }, { }, { "a_zp" }, 1, data_types::f32,
tensor{ 0, 0, 2, 2 }, tensor(0), tensor{1, 1, 1, 1}, tensor{1, 2, 3, 2}),
reorder("out", "conv", format::bfyx, data_types::f32));
build_options opts;
opts.set_option(build_option::optimize_data(true));
network network(engine, topology, opts);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_ptr = output_memory.pointer<float>();
auto output_layout = output_memory.get_layout();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; f++)
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 1e-5f) <<
" x="<< x << " y=" << y << " f=" << f;
}
}
}
TEST(convolution_int8_fw_gpu, quantized_convolution_u8s8f32_asymmetric_activations_per_channel) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 2, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 2, 3, 3 } });
cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
auto a_zp = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 2, 1, 1 } });
set_values<uint8_t>(input, { 1, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1,
1, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1 });
set_values<int8_t>(weights, { 1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5,
1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5 });
set_values<uint8_t>(a_zp, { 2, 5 });
set_values(biases, { 1.0f, -8.0f });
VVVF<float> output_vec = {
{
{ -36.0f, 5.0f, -14.0f },
{ -24.0f, -10.0f, -30.0f }
},
{
{ -45.0f, -4.0f, -23.0f },
{ -33.0f, -19.0f, -39.0f }
} };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
data("a_zp", a_zp),
convolution("conv", "input", { "weights" }, { "biases" }, { }, { "a_zp" }, 1, data_types::f32,
tensor{ 0, 0, 2, 2 }, tensor(0), tensor{1, 1, 1, 1}, tensor{1, 2, 3, 2}),
reorder("out", "conv", format::bfyx, data_types::f32));
build_options opts;
opts.set_option(build_option::optimize_data(true));
network network(engine, topology, opts);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_ptr = output_memory.pointer<float>();
auto output_layout = output_memory.get_layout();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; f++)
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 1e-5f) <<
" x="<< x << " y=" << y << " f=" << f;
}
}
}
TEST(convolution_int8_fw_gpu, quantized_convolution_u8s8f32_asymmetric_activations_per_channel_3ic_with_sub) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 3, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 3, 3, 3 } });
cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
auto a_zp = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 3, 1, 1 } });
set_values<uint8_t>(input, { 1, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1,
2, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1,
3, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1 });
set_values<int8_t>(weights, { 1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5,
1, 2, -1,
-2, 1, 2,
9, 7, -1,
1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5,
9, 0, -4,
-1, 3, 2,
0, 2, 5 });
set_values<uint8_t>(a_zp, { 2, 5, 6 });
set_values(biases, { 1.0f, -8.0f });
VVVF<float> output_vec = {
{
{ -82.0f, -26.0f, -60.0f },
{ -35.0f, -15.0f, -25.0f }
},
{
{ -86.0f, -57.0f, -32.0f },
{ -68.0f, -46.0f, -79.0f }
} };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
data("a_zp", a_zp),
activation("activation", "input", activation_func::relu), // needed just to add padding
eltwise("in", {"activation", "a_zp"}, eltwise_mode::sub, data_types::f32),
convolution("conv", "in", { "weights" }, { "biases" }, 1,
tensor{ 0, 0, 2, 2 }, tensor(0), tensor{1, 1, 1, 1}, tensor{1, 2, 3, 2}, data_types::f32),
reorder("out", "conv", format::bfyx, data_types::f32));
build_options opts;
opts.set_option(build_option::optimize_data(true));
network network(engine, topology, opts);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_ptr = output_memory.pointer<float>();
auto output_layout = output_memory.get_layout();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; f++)
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 1e-5f) <<
" x="<< x << " y=" << y << " f=" << f;
}
}
}
TEST(convolution_int8_fw_gpu, quantized_convolution_u8s8f32_asymmetric_weights_per_channel) {
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::u8, format::bfyx,{ 1, 1, 5, 4 } });
auto weights = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 3, 3 } });
cldnn::memory biases = memory::allocate(engine, { data_types::f32, format::bfyx,{ 1, 2, 1, 1 } });
auto w_zp = memory::allocate(engine, { data_types::i8, format::bfyx,{ 2, 1, 1, 1 } });
set_values<uint8_t>(input, { 1, 2, 3, 4, 5,
2, 2, 3, 4, 6,
3, 3, 3, 5, 1,
1, 1, 1, 1, 1 });
set_values<int8_t>(weights, { 1, 2, -1,
-2, 1, 2,
9, 7, -1,
9, 0, -4,
-1, 3, 2,
0, 2, 5 });
set_values<int8_t>(w_zp, { 1, -1 });
set_values(biases, { 1.0f, -8.0f });
VVVF<float> output_vec = {
{
{ 30.0f, 44.0f, -9.0f },
{ -4.0f, 2.0f, -2.0f }
},
{
{ 42.0f, 69.0f, 43.0f },
{ 23.0f, 31.0f, 2.0f }
} };
topology topology(
input_layout("input", input.get_layout()),
data("weights", weights),
data("biases", biases),
data("w_zp", w_zp),
convolution("conv", "input", { "weights" }, { "biases" }, { "w_zp" }, { }, 1, data_types::f32,
tensor{ 0, 0, 2, 2 }, tensor(0), tensor{1, 1, 1, 1}, tensor{1, 2, 3, 2}),
reorder("out", "conv", format::bfyx, data_types::f32));
build_options opts;
opts.set_option(build_option::optimize_data(true));
network network(engine, topology, opts);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_ptr = output_memory.pointer<float>();
auto output_layout = output_memory.get_layout();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, 2);
EXPECT_EQ(x_size, 3);
EXPECT_EQ(f_size, 2);
EXPECT_EQ(b_size, 1);
for (int f = 0; f < f_size; f++)
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_NEAR(output_vec[f][y][x], ((float)output_ptr[f*y_size*x_size + y * x_size + x]), 1e-5f) <<
" x="<< x << " y=" << y << " f=" << f;
}
}
}
TEST(convolution_gpu, basic_yxfb_4_4_yxfb_2_2_b16_if2_of16_st2_2_p0_sp1_fp16)
{
#define USE_OLD_WEIGHTS_FORMAT 0 // 0 - yxfb weights layout, 1 - old bfyx weights layout
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const auto input_format = format::yxfb;
#if USE_OLD_WEIGHTS_FORMAT
const auto weights_format = format::bfyx;
#else
const auto weights_format = format::yxfb;
#endif
const auto biases_format = format::bfyx;
const auto output_format = input_format;
const int32_t batch_size = 16;
const int32_t input_feature_count = 2;
const int32_t output_feature_count = 16;
const int32_t stride_x = 2;
const int32_t stride_y = 2;
const int32_t input_x = 4;
const int32_t input_y = 4;
const int32_t weights_x = 2;
const int32_t weights_y = 2;
const int32_t output_x = (input_x - weights_x) / stride_x + 1;
const int32_t output_y = (input_y - weights_y) / stride_y + 1;
auto input_size = tensor( batch_size, input_feature_count, input_x, input_y );
auto input = memory::allocate(engine, { data_types::f32, input_format, input_size });
auto weights_size = tensor( output_feature_count, input_feature_count, weights_x, weights_y );
auto weights = memory::allocate(engine, { data_types::f32, weights_format, weights_size });
auto biases_size = tensor( 1,output_feature_count,1,1 );
auto biases = memory::allocate(engine, { data_types::f32, biases_format, biases_size });
auto output_size = tensor( batch_size, output_feature_count, output_x, output_y );
//auto output = memory::allocate({output_format, {batch_size, {output_x, output_y}, output_feature_count}});
//auto input_cvtd = memory::allocate(engine, { data_types::f16, input_size });
//auto weights_cvtd = memory::allocate(engine, { data_types::f16, weights_size });
//auto biases_cvtd = memory::allocate(engine, { data_types::f16, biases_size });
//auto output_cvtd = memory::allocate({output_cvt_format, {batch_size, {output_x, output_y}, output_feature_count}});
// input:
std::vector<float> input_vals_template {
0.25f, 0.50f, 0.75f, 1.00f,
1.25f, 1.50f, 1.75f, 2.00f,
2.25f, 2.50f, 2.75f, 3.00f,
3.25f, 3.50f, 3.75f, 4.00f,
};
input_vals_template.resize(input_y * input_x);
std::vector<float> input_vals;
input_vals.reserve(input_y * input_x * input_feature_count * batch_size);
for (uint32_t yxi = 0; yxi < input_y * input_x; ++yxi)
{
for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
{
for (uint32_t bi = 0; bi < batch_size; ++bi)
{
input_vals.push_back((bi * input_feature_count + ifi + 1) * input_vals_template[yxi]);
}
}
}
set_values(input, input_vals);
// weights:
std::vector<float> weights_vals_template {
-0.50f, -0.25f,
0.50f, 0.50f,
};
weights_vals_template.resize(weights_y * weights_x);
std::vector<float> weights_vals;
weights_vals.reserve(weights_y * weights_x * input_feature_count * output_feature_count);
#if USE_OLD_WEIGHTS_FORMAT
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
{
for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
{
weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
}
}
}
#else
for (uint32_t yxi = 0; yxi < weights_y * weights_x; ++yxi)
{
for (uint32_t ifi = 0; ifi < input_feature_count; ++ifi)
{
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
weights_vals.push_back((ofi * input_feature_count + ifi + 1) * weights_vals_template[yxi]);
}
}
}
#endif
set_values(weights, weights_vals);
// biases:
std::vector<float> biases_vals;
biases_vals.reserve(output_feature_count);
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
biases_vals.push_back(ofi * 1.0f);
}
set_values(biases, biases_vals);
// output:
std::vector<float> output_vals_template {
1.125f, 1.250f,
1.625f, 1.750f,
};
output_vals_template.resize(output_y * output_x);
std::vector<float> output_vals;
output_vals.reserve(output_y * output_x * output_feature_count * batch_size);
for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
{
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
for (uint32_t bi = 0; bi < batch_size; ++bi)
{
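                // Same closed-form scaling factor as in the fp32 variant of this test;
                // see the derivation there.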
uint32_t template_factor = input_feature_count * input_feature_count * input_feature_count * bi * ofi +
input_feature_count * input_feature_count * (input_feature_count + 1) / 2 * (bi + ofi) +
input_feature_count * (input_feature_count + 1) * (2 * input_feature_count + 1) / 6;
float bias_factor = ofi * 1.0f;
output_vals.push_back(template_factor * output_vals_template[yxi] + bias_factor);
}
}
}
//auto expected_float = memory::allocate(engine, { data_types::f32,{ format::x,{ static_cast<int32_t>(output_vals.size()) } } });
//auto expected_half = memory::allocate(engine, { data_types::f16,{ format::x,{ static_cast<int32_t>(output_vals.size()) } } });
//auto expected = memory::allocate(engine, { data_types::f32,{ format::x,{ static_cast<int32_t>(output_vals.size()) } } });
// set_values(expected_float, output_vals);
// auto cvt_expected_f32_f16 = reorder::create({expected_float, expected_half});
// auto cvt_expected_f16_f32 = reorder::create({expected_half, expected});
// execute({cvt_expected_f32_f16, cvt_expected_f16_f32}).wait();
//
// auto expected_ptr = expected.as<const memory&>().pointer<float>();
// Computing convolution.
topology topology(
input_layout("input", input.get_layout()),
reorder("cvt_input", "input", {data_types::f16, input_format, input_size}),
data("weights", weights),
reorder("cvt_weights", "weights", {data_types::f16, weights_format, weights_size}),
data("biases", biases),
reorder("cvt_biases", "biases", {data_types::f16, biases_format, biases_size}),
convolution(
"conv",
"cvt_input",
{ "cvt_weights" },
{ "cvt_biases" },
{ 1,1,stride_x,stride_y }),
reorder("output", "conv", {data_types::f32, output_format, output_size})
);
network network(engine, topology);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "output");
auto output_prim = outputs.begin()->second.get_memory();
auto output_ptr = output_prim.pointer<float>();
// Checking result.
uint32_t i = 0;
for (uint32_t yxi = 0; yxi < output_y * output_x; ++yxi)
{
for (uint32_t ofi = 0; ofi < output_feature_count; ++ofi)
{
for (uint32_t bi = 0; bi < batch_size; ++bi, ++i)
{
auto equal = are_equal(output_vals[i] /*get_value(expected_ptr, i)*/, output_ptr[i], 0.002f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Failed at position (" << yxi << ", output feature = " << ofi << ", batch = " << bi << "): "
<< output_vals[i] /*get_value(expected_ptr, i)*/ << " != " << output_ptr[i] << std::endl;
return;
}
}
}
}
#undef USE_OLD_WEIGHTS_FORMAT
}
using TestParamType_convolution_gpu = ::testing::tuple<int, // 0 - Filter size
int, // 1 - Input features
int, // 2 - Stride
int, // 3 - Output padding
bool>; // 4 - With bias
using TestParamType_convolution_gpu_block_layout = ::testing::tuple<int,   // 0 - Batch size
int, // 1 - Input features
int, // 2 - Output features
int, // 3 - Filter size
int, // 4 - Stride
int, // 5 - Output padding
bool>; // 6 - With bias
using TestParamType_convolution_depthwise_gpu = ::testing::tuple<int, // 0 - Input XY size
                                                                  int, // 1 - Kernel size Y
                                                                  int, // 2 - Kernel size X
int, // 3 - Groups number
int, // 4 - Stride
int, // 5 - Output padding
bool>; // 6 - With bias
using TestParamType_grouped_convolution_gpu = ::testing::tuple< int, // 0 - Input X size
int, // 1 - Input Y size
int, // 2 - Input features
int, // 3 - Output features
                                                                 int, // 4 - Kernel size X
                                                                 int, // 5 - Kernel size Y
int, // 6 - Groups number
int, // 7 - Stride
int, // 8 - Batch
format>; // 9 - Input data format
struct convolution_gpu : public ::testing::TestWithParam<TestParamType_convolution_gpu>
{
static std::string
PrintToStringParamName(testing::TestParamInfo<TestParamType_convolution_gpu> param_info)
{
// construct a readable name
return std::to_string(testing::get<0>(param_info.param))
+ 'x' + std::to_string(testing::get<0>(param_info.param))
+ "_f" + std::to_string(testing::get<1>(param_info.param))
+ "_stride" + std::to_string(testing::get<2>(param_info.param))
+ "_pad" + std::to_string(testing::get<3>(param_info.param))
+ (testing::get<4>(param_info.param) ? "_bias" : "");
}
};
struct convolution_gpu_fs_byx_fsv32 : public convolution_gpu {};
struct convolution_gpu_block_layout : public ::testing::TestWithParam<TestParamType_convolution_gpu_block_layout>
{
static std::string
PrintToStringParamName(testing::TestParamInfo<TestParamType_convolution_gpu_block_layout> param_info)
{
// construct a readable name
return std::to_string(testing::get<0>(param_info.param))
+ "x_in" + std::to_string(testing::get<1>(param_info.param))
+ "x_out" + std::to_string(testing::get<2>(param_info.param))
+ "_k" + std::to_string(testing::get<3>(param_info.param))
+ "x" + std::to_string(testing::get<3>(param_info.param))
+ "_stride" + std::to_string(testing::get<4>(param_info.param))
+ "_pad" + std::to_string(testing::get<5>(param_info.param))
+ (testing::get<6>(param_info.param) ? "_bias" : "");
}
};
struct convolution_depthwise_gpu : public ::testing::TestWithParam<TestParamType_convolution_depthwise_gpu>
{
static std::string
PrintToStringParamName(testing::TestParamInfo<TestParamType_convolution_depthwise_gpu> param_info)
{
// construct a readable name
return "in" + std::to_string(testing::get<0>(param_info.param))
+ "x" + std::to_string(testing::get<0>(param_info.param))
+ "_k" + std::to_string(testing::get<1>(param_info.param))
+ 'x' + std::to_string(testing::get<2>(param_info.param))
+ "_f" + std::to_string(testing::get<3>(param_info.param))
+ "_stride" + std::to_string(testing::get<4>(param_info.param))
+ "_pad" + std::to_string(testing::get<5>(param_info.param))
+ (testing::get<6>(param_info.param) ? "_bias" : "");
}
};
struct convolution_depthwise_gpu_fsv16 : public ::testing::TestWithParam<TestParamType_convolution_depthwise_gpu>
{
static std::string
PrintToStringParamName(testing::TestParamInfo<TestParamType_convolution_depthwise_gpu> param_info)
{
// construct a readable name
return "in" + std::to_string(testing::get<0>(param_info.param))
+ "x" + std::to_string(testing::get<0>(param_info.param))
+ "_k" + std::to_string(testing::get<1>(param_info.param))
+ 'x' + std::to_string(testing::get<2>(param_info.param))
+ "_f" + std::to_string(testing::get<3>(param_info.param))
+ "_stride" + std::to_string(testing::get<4>(param_info.param))
+ "_pad" + std::to_string(testing::get<5>(param_info.param))
+ (testing::get<6>(param_info.param) ? "_bias" : "");
}
};
struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_grouped_convolution_gpu> {
static std::string PrintToStringParamName(
testing::TestParamInfo<TestParamType_grouped_convolution_gpu> param_info) {
// construct a readable name
return "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
std::to_string(testing::get<1>(param_info.param)) + "y" +
std::to_string(testing::get<2>(param_info.param)) + "f" +
"_output" + std::to_string(testing::get<3>(param_info.param)) + "f" +
"_filter" + std::to_string(testing::get<4>(param_info.param)) + "x" +
std::to_string(testing::get<5>(param_info.param)) + "y" +
"_groups" + std::to_string(testing::get<6>(param_info.param)) +
"_stride" + std::to_string(testing::get<7>(param_info.param)) +
"_batch" + std::to_string(testing::get<8>(param_info.param)) +
"_format" + std::to_string(testing::get<9>(param_info.param));
}
};
INSTANTIATE_TEST_CASE_P(convolution_gpu_test,
convolution_gpu_fs_byx_fsv32,
::testing::Values(
// Filter size, Input features, Stride, Output padding, With bias
TestParamType_convolution_gpu(1, 20, 1, 0, false),
TestParamType_convolution_gpu(3, 80, 1, 0, false),
TestParamType_convolution_gpu(1, 32, 1, 0, true),
TestParamType_convolution_gpu(3, 32, 1, 0, true),
TestParamType_convolution_gpu(1, 32, 1, 1, false),
TestParamType_convolution_gpu(3, 32, 1, 1, false),
TestParamType_convolution_gpu(1, 32, 2, 0, false),
TestParamType_convolution_gpu(3, 32, 2, 0, false),
TestParamType_convolution_gpu(3, 64, 2, 1, true)),
convolution_gpu::PrintToStringParamName);
TEST_P(convolution_gpu_fs_byx_fsv32, fs_byx_fsv32)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = 2;
const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = 64;
const int filter_xy = testing::get<0>(GetParam());
const int stride = testing::get<2>(GetParam());
const int output_padding = testing::get<3>(GetParam());
const bool with_bias = testing::get<4>(GetParam());
const int input_offset = -(filter_xy / 2);
const int output_xy = 1 + (input_xy + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
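    // Spatial output size: input padded by (-input_offset) on each side, plus
    // output_padding appended on both sides of the result.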
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights_fsv", weights_mem));
// Reorder input to fs_byx_fsv32
topology.add(reorder("input_fsv", "input", { data_types::f16, format::fs_b_yx_fsv32, input_size }));
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1);
auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases_fsv", biases_mem));
auto conv_fsv = convolution("conv_fsv", "input_fsv", { "weights_fsv" }, { "biases_fsv" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_fsv = convolution("conv_fsv", "input_fsv", { "weights_fsv" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
}
build_options options;
implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" };
options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
options.set_option(build_option::optimize_data(true));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_fsv").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::fs_b_yx_fsv32);
for (int bi = 0; bi < batch_num; ++bi)
for (int fi = 0; fi < output_f; ++fi)
for (int yi = 0; yi < output_xy; ++yi)
for (int xi = 0; xi < output_xy; ++xi)
{
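                    // fs_b_yx_fsv32 layout: features are grouped into slices of 32 (fsv).
                    // Flat offset order: feature slice (fi / 32), batch, y, x,
                    // feature within slice (fi % 32).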
auto val_ref = reference_result[bi][fi][yi][xi];
auto val = out_ptr[(fi / 32) * batch_num * output_xy * output_xy * 32 +
bi * output_xy * output_xy * 32 +
yi * output_xy * 32 +
xi * 32 +
fi % 32];
auto equal = are_equal(val_ref, val, 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
}
}
}
TEST(convolution_f16_fsv_gpu, convolution_f16_fsv_gpu_padding) {
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = 2;
const int input_xy = 32;
const int input_f = 96;
const int output_f = 192;
const int filter_xy = 1;
const int stride = 1;
const int output_xy = 1 + (input_xy - filter_xy) / stride;
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
    // Will be used to store reference values (bias is always used in this test)
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights_fsv", weights_mem));
// add input padding by X and Y
layout w_pad(data_types::f16, format::bfyx, input_size, padding({ 0,0,1,1 }, { 0, 0, 0, 0 }));
topology.add(reorder("input_fsv", "input", w_pad));
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1);
auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
biases_data[ofi],
1, 1);
}
}
topology.add(data("biases_fsv", biases_mem));
auto conv_fsv = convolution("conv_fsv", "input_fsv", { "weights_fsv" }, { "biases_fsv" },
{ 1, 1, stride, stride }, { 0, 0, 0, 0 });
topology.add(conv_fsv);
build_options options;
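    // Force the bfyx -> fs_b_yx_fsv32 kernel so the padded-input path is the one tested.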
implementation_desc conv_impl = { format::fs_b_yx_fsv32, "convolution_gpu_bfyx_to_fs_byx_fsv32" };
options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
options.set_option(build_option::optimize_data(true));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_fsv").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::fs_b_yx_fsv32);
for (int bi = 0; bi < batch_num; ++bi)
for (int fi = 0; fi < output_f; ++fi)
for (int yi = 0; yi < output_xy; ++yi)
for (int xi = 0; xi < output_xy; ++xi)
{
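                    // Same fs_b_yx_fsv32 offset computation as in the fs_byx_fsv32 test above:
                    // feature slice, batch, y, x, feature within slice.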
auto val_ref = reference_result[bi][fi][yi][xi];
auto val = out_ptr[(fi / 32) * batch_num * output_xy * output_xy * 32 +
bi * output_xy * output_xy * 32 +
yi * output_xy * 32 +
xi * 32 +
fi % 32];
auto equal = are_equal(val_ref, val, 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
}
}
}
using TestParamType_convolution_gpu_with_crop = ::testing::tuple<int, // 0 - Filter size
int, // 1 - Input size
int, // 2 - Input/output features
int, // 3 - Stride
int, // 4 - Output padding
bool>; // 5 - With bias
struct convolution_gpu_fs_byx_fsv32_crop : public ::testing::TestWithParam<TestParamType_convolution_gpu_with_crop>
{
static std::string
PrintToStringParamName(testing::TestParamInfo<TestParamType_convolution_gpu_with_crop> param_info)
{
// construct a readable name
return std::to_string(testing::get<0>(param_info.param))
+ 'x' + std::to_string(testing::get<1>(param_info.param))
+ "_f" + std::to_string(testing::get<2>(param_info.param))
+ "_stride" + std::to_string(testing::get<3>(param_info.param))
+ "_pad" + std::to_string(testing::get<4>(param_info.param))
+ (testing::get<5>(param_info.param) ? "_bias" : "");
}
};
INSTANTIATE_TEST_CASE_P(convolution_gpu_with_crop,
convolution_gpu_fs_byx_fsv32_crop,
::testing::Values(
TestParamType_convolution_gpu_with_crop(1, 14, 24, 1, 0, true)
),
convolution_gpu_fs_byx_fsv32_crop::PrintToStringParamName);
TEST_P(convolution_gpu_fs_byx_fsv32_crop, fs_byx_fsv32_crop)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = 4;
const int filter_xy = testing::get<0>(GetParam());
const int input_xy = testing::get<1>(GetParam());
const int input_f = testing::get<2>(GetParam());
const int output_f = input_f;
const int stride = testing::get<3>(GetParam());
const int output_padding = testing::get<4>(GetParam());
const bool with_bias = testing::get<5>(GetParam());
const int input_offset = -(filter_xy / 2);
const int output_xy = 1 + (input_xy + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// ref input
auto half_input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
auto input_data = VVVVF<FLOAT16>(batch_num);
// concatenated cldnn input tensor
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ifi = 0; ifi < input_f; ++ifi)
{
auto fdata = half_input_data[bi][ifi];
input_data[bi].emplace_back(std::move(fdata));
}
for (auto ifi = 0; ifi < input_f; ++ifi)
{
auto fdata = half_input_data[bi][ifi];
input_data[bi].emplace_back(std::move(fdata));
}
}
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_size = tensor(batch_num, input_f * 2, input_xy, input_xy);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights_fsv", weights_mem));
auto crop_batch_num = batch_num;
auto crop_feature_num = input_f;
auto crop_x_size = input_xy;
auto crop_y_size = input_xy;
auto left_crop = crop("left_crop", "input", { crop_batch_num, crop_feature_num, crop_x_size, crop_y_size }, { 0, 0, 0, 0 });
auto right_crop = crop("right_crop", "input", { crop_batch_num, crop_feature_num, crop_x_size, crop_y_size }, { 0, input_f, 0, 0 });
topology.add(left_crop);
topology.add(right_crop);
// Will be used to store reference values calculated in branches depending on bias
auto half_ref_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
// reference convolution and concat
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1);
auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
half_ref_result[bi][ofi] = reference_convolve(
half_input_data[bi], weights_data[ofi],
stride, stride,
biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases_fsv", biases_mem));
auto conv_fsv = convolution("conv_fsv", "right_crop", { "weights_fsv" }, { "biases_fsv" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
half_ref_result[bi][ofi] = reference_convolve(
half_input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_fsv = convolution("conv_fsv", "right_crop", { "weights_fsv" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
}
topology.add(reorder("reorder", "conv_fsv", { data_types::f16, format::bfyx, input_size }));
topology.add(concatenation("concat", { "left_crop", "reorder" }, concatenation::concatenation_axis::along_f));
auto ref_result = VVVVF<FLOAT16>(batch_num);
// concatenate half ref input and ref conv output, by features
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ifi = 0; ifi < input_f; ++ifi)
{
auto fdata = half_input_data[bi][ifi];
ref_result[bi].emplace_back(std::move(fdata));
}
for (auto ifi = 0; ifi < input_f; ++ifi)
{
auto fdata = half_ref_result[bi][ifi];
ref_result[bi].emplace_back(std::move(fdata));
}
}
build_options options;
implementation_desc conv_impl = { format::fs_b_yx_fsv32, "convolution_gpu_bfyx_to_fs_byx_fsv32" };
options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
options.set_option(build_option::optimize_data(true));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("concat").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::bfyx);
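    // Concat output is plain bfyx with 2 * output_f features:
    // linear index = ((b * 2 * output_f + f) * output_xy + y) * output_xy + x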
for (int bi = 0; bi < batch_num; ++bi)
for (int fi = 0; fi < output_f * 2; ++fi)
for (int yi = 0; yi < output_xy; ++yi)
for (int xi = 0; xi < output_xy; ++xi)
{
auto val_ref = ref_result[bi][fi][yi][xi];
auto val = out_ptr[bi * output_xy * output_xy * output_f * 2 +
fi * output_xy * output_xy +
yi * output_xy +
xi];
auto equal = are_equal(val_ref, val, 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
}
}
}
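// Plain bfyx fp16 convolution with a 5x5 filter; the sizes are chosen to
// exercise the bfyx iyxo kernel path suggested by the test name.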
TEST(convolution_gpu, bfyx_iyxo_5x5_fp16)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = 1;
const int output_f = 4;
const int input_f = 32;
const int filter_xy = 5;
const int stride = 1;
const int output_padding = 0;
const bool with_bias = false;
const int input_size_x = 64;
const int input_size_y = 20;
const int input_offset = -(filter_xy / 2);
const int output_x = 1 + (input_size_x + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
const int output_y = 1 + (input_size_y + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
auto input_size = tensor(batch_num, input_f, input_size_x, input_size_y);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_size_y, input_size_x, -1, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights_fsv", weights_mem)
);
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1);
auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride, biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases_fsv", biases_mem));
auto conv_fsv = convolution("conv_fsv", "input", { "weights_fsv" }, { "biases_fsv" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_fsv = convolution("conv_fsv", "input", { "weights_fsv" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
}
build_options options;
    // Force any bfyx-format implementation for the tested convolution
    implementation_desc conv_impl = { format::bfyx, "" };
    options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
    options.set_option(build_option::optimize_data(true));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_fsv").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
for (int bi = 0; bi < batch_num; ++bi)
for (int fi = 0; fi < output_f; ++fi)
for (int yi = 0; yi < output_y; ++yi)
for (int xi = 0; xi < output_x; ++xi)
{
auto val_ref = reference_result[bi][fi][yi][xi];
auto val = out_ptr[bi * output_f * output_x * output_y +
fi * output_y * output_x +
yi * output_x +
xi];
auto equal = are_equal(val_ref, val, 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
}
}
}
INSTANTIATE_TEST_CASE_P(convolution_gpu_block,
convolution_gpu_block_layout,
::testing::Values(
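                            // Batch, Input features, Output features, Filter size, Stride, Output padding, With bias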
TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, false),
TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, false),
TestParamType_convolution_gpu_block_layout(32, 32, 32, 1, 1, 0, false),
TestParamType_convolution_gpu_block_layout(32, 64, 64, 1, 1, 0, false),
TestParamType_convolution_gpu_block_layout(16, 64, 64, 1, 1, 0, true),
TestParamType_convolution_gpu_block_layout(16, 16, 16, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(32, 16, 16, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(16, 32, 16, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(16, 16, 32, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(32, 16, 32, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(16, 32, 32, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(16, 64, 16, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(32, 32, 16, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(32, 32, 32, 3, 1, 0, true),
TestParamType_convolution_gpu_block_layout(64, 64, 64, 3, 1, 0, true)),
convolution_gpu_block_layout::PrintToStringParamName);
TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32)
{
const auto& engine = get_test_engine();
const int batch_num = testing::get<0>(GetParam());
const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
const int input_offset = -(filter_xy / 2);
auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f32, format::bfzyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy, 1);
auto weights_data = generate_random_4d<float>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f32, format::bfzyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<float>(batch_num, VVVF<float>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
    // Reorder input to bs_fs_zyx_bsv16_fsv16
topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_zyx_bsv16_fsv16, input_size }));
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1, 1);
auto biases_data = generate_random_1d<float>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f32, format::bfzyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride, biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases", biases_mem));
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" }, { "biases" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset, 0 });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding, 0 }, 0.f);
topology.add(conv_bsv16_fsv16);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset, 0 });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding, 0 }, 0.f);
topology.add(conv_bsv16_fsv16);
}
topology.add(reorder("reorder_bfzyx", "conv_bsv16_fsv16", format::bfzyx, data_types::f32));
build_options options;
options.set_option(build_option::optimize_data(true));
options.set_option(build_option::outputs({"conv_bsv16_fsv16", "reorder_bfzyx"}));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory();
auto out_ptr = out_mem.pointer<float>();
auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
for (size_t i = 0; i < out_ptr_bfyx.size(); i++) {
auto equal = are_equal(flatten_ref[i], out_ptr_bfyx[i], 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Difference at idx = " << i << std::endl;
return;
}
}
}
TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp16)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = testing::get<0>(GetParam());
const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
const int input_offset = -(filter_xy / 2);
if (batch_num % 32 != 0)
{
std::cout << "[ SKIPPED ] The test is skipped (for fp16 batch should be multiple of 32)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, 0, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfzyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy, 1);
auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, 0, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfzyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
// Reorder input to bs_fs_zyx_bsv16_fsv16
topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f16, format::bs_fs_zyx_bsv16_fsv16, input_size }));
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1, 1);
auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfzyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride, biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases", biases_mem));
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" }, { "biases" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset, 0 });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding, 0 }, 0.f);
topology.add(conv_bsv16_fsv16);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset, 0 });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding, 0 }, 0.f);
topology.add(conv_bsv16_fsv16);
}
topology.add(reorder("reorder_bfzyx", "conv_bsv16_fsv16", format::bfzyx, data_types::f16));
build_options options;
options.set_option(build_option::optimize_data(true));
options.set_option(build_option::outputs({"conv_bsv16_fsv16", "reorder_bfzyx"}));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
for (size_t i = 0; i < out_ptr_bfyx.size(); i++) {
auto equal = are_equal(flatten_ref[i], out_ptr_bfyx[i], 1);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Difference at idx = " << i << std::endl;
return;
}
}
}
TEST_P(convolution_gpu_block_layout, bfzyx_bsv16_fsv16_fp32_fused_ops)
{
const auto& engine = get_test_engine();
const int batch_num = testing::get<0>(GetParam());
const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
const int input_offset = -(filter_xy / 2);
auto input_size = tensor(batch_num, input_f, input_xy, input_xy, 1);
auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f32, format::bfzyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy, 1);
auto weights_data = generate_random_4d<float>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f32, format::bfzyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<float>(batch_num, VVVF<float>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
    // Reorder input to bs_fs_zyx_bsv16_fsv16
topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_zyx_bsv16_fsv16, input_size }));
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1, 1);
auto biases_data = generate_random_1d<float>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f32, format::bfzyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride, biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases", biases_mem));
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" }, { "biases" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset, 0 });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding, 0 }, 0.f);
topology.add(conv_bsv16_fsv16);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset, 0 });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding, 0 }, 0.f);
topology.add(conv_bsv16_fsv16);
}
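    // Fuse a scalar multiplication after the convolution; with optimize_data
    // enabled it is expected to be fused into the conv kernel, so the reference
    // below is compared as flatten_ref[i] * scalar.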
const float scalar = 5.5f;
auto scale_mem = memory::allocate(engine, { data_types::f32, format::bfzyx, {1, 1, 1, 1, 1} });
set_values(scale_mem, {scalar});
topology.add(data("scalar", scale_mem));
topology.add(scale("scale", "conv_bsv16_fsv16", "scalar"));
topology.add(reorder("reorder_bfzyx", "scale", format::bfzyx, data_types::f32));
build_options options;
options.set_option(build_option::optimize_data(true));
options.set_option(build_option::outputs({"conv_bsv16_fsv16", "reorder_bfzyx"}));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory();
auto out_ptr = out_mem.pointer<float>();
auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_zyx_bsv16_fsv16);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
for (size_t i = 0; i < out_ptr_bfyx.size(); i++) {
auto equal = are_equal(flatten_ref[i] * scalar, out_ptr_bfyx[i], 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Difference at idx = " << i << std::endl;
return;
}
}
}
TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32)
{
const auto& engine = get_test_engine();
const int batch_num = testing::get<0>(GetParam());
const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
const int input_offset = -(filter_xy / 2);
if (batch_num <= 16)
{
std::cout << "[ SKIPPED ] The test is skipped (for bs_fs_yx_bsv16_fsv16 batch should be greater than 16)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f32, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
auto weights_data = generate_random_4d<float>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f32, format::bfyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<float>(batch_num, VVVF<float>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
// Reorder input to bs_fs_yx_bsv16_fsv16
topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_yx_bsv16_fsv16, input_size }));
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1);
auto biases_data = generate_random_1d<float>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f32, format::bfyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride, biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases", biases_mem));
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" }, { "biases" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_bsv16_fsv16);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_bsv16_fsv16);
}
topology.add(reorder("reorder_bfyx", "conv_bsv16_fsv16", format::bfyx, data_types::f32));
build_options options;
options.set_option(build_option::optimize_data(true));
options.set_option(build_option::outputs({"conv_bsv16_fsv16", "reorder_bfyx"}));
implementation_desc conv_impl = { format::bs_fs_yx_bsv16_fsv16, "" };
options.set_option(build_option::force_implementations({{"conv_bsv16_fsv16", conv_impl}}));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory();
auto out_ptr = out_mem.pointer<float>();
auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_yx_bsv16_fsv16);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
for (size_t i = 0; i < out_ptr_bfyx.size(); i++) {
auto equal = are_equal(flatten_ref[i], out_ptr_bfyx[i], 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Difference at idx = " << i << std::endl;
return;
}
}
}
TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp16)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = testing::get<0>(GetParam());
const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
const int input_offset = -(filter_xy / 2);
if (batch_num % 32 != 0)
{
std::cout << "[ SKIPPED ] The test is skipped (for fp16 batch should be multiple of 32)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, 0, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, 0, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
// Reorder input to bs_fs_yx_bsv16_fsv16
topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f16, format::bs_fs_yx_bsv16_fsv16, input_size }));
if (with_bias)
{
// Generate bias data
auto biases_size = tensor(1, output_f, 1, 1);
auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride, biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases", biases_mem));
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" }, { "biases" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
        conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_bsv16_fsv16);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_bsv16_fsv16);
}
topology.add(reorder("reorder_bfyx", "conv_bsv16_fsv16", format::bfyx, data_types::f16));
build_options options;
options.set_option(build_option::optimize_data(true));
options.set_option(build_option::outputs({"conv_bsv16_fsv16", "reorder_bfyx"}));
implementation_desc conv_impl = { format::bs_fs_yx_bsv16_fsv16, "" };
options.set_option(build_option::force_implementations({{"conv_bsv16_fsv16", conv_impl}}));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_yx_bsv16_fsv16);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
for (size_t i = 0; i < out_ptr_bfyx.size(); i++) {
auto equal = are_equal(flatten_ref[i], out_ptr_bfyx[i], 1);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Difference at idx = " << i << std::endl;
return;
}
}
}
TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32_fused_ops)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = testing::get<0>(GetParam()) * 2;
const int input_xy = 5;
const int input_f = testing::get<1>(GetParam());
const int output_f = testing::get<2>(GetParam());
const int filter_xy = testing::get<3>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const bool with_bias = testing::get<6>(GetParam());
const int input_offset = -(filter_xy / 2);
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<float>(batch_num, input_f, input_xy, input_xy, 1, 10);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f32, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
    auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
auto weights_data = generate_random_4d<float>(output_f, input_f, filter_xy, filter_xy, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f32, format::bfyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<float>(batch_num, VVVF<float>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
// Reorder input to bs_fs_yx_bsv16_fsv16
topology.add(reorder("input_bsv16_fsv16", "input", { data_types::f32, format::bs_fs_yx_bsv16_fsv16, input_size }));
if (with_bias)
{
// Generate bias data
        auto biases_size = tensor(1, output_f, 1, 1);
auto biases_data = generate_random_1d<float>(output_f, -1, 1);
auto biases_mem = memory::allocate(engine, { data_types::f32, format::bfyx, biases_size });
set_values(biases_mem, biases_data);
// Calculate reference values with bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride, biases_data[ofi],
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
topology.add(data("biases", biases_mem));
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" }, { "biases" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_bsv16_fsv16);
}
else
{
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi],
stride, stride,
0, // bias
1, 1, // dilation
-input_offset, -input_offset, // input padding
output_padding, output_padding);
}
}
auto conv_bsv16_fsv16 = convolution("conv_bsv16_fsv16", "input_bsv16_fsv16", { "weights" },
{ 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
conv_bsv16_fsv16.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_bsv16_fsv16);
}
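    // Same fused-ops check as the bfzyx variant: the scale op is expected to be
    // fused into the convolution, so the reference is flatten_ref[i] * scalar.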
const float scalar = 5.5f;
auto scale_mem = memory::allocate(engine, { data_types::f32, format::bfyx, {1, 1, 1, 1} });
set_values(scale_mem, {scalar});
topology.add(data("scalar", scale_mem));
topology.add(scale("scale", "conv_bsv16_fsv16", "scalar"));
topology.add(reorder("reorder_bfyx", "scale", format::bfyx, data_types::f32));
build_options options;
options.set_option(build_option::optimize_data(true));
options.set_option(build_option::outputs({"conv_bsv16_fsv16", "reorder_bfyx"}));
implementation_desc conv_impl = { format::bs_fs_yx_bsv16_fsv16, "" };
options.set_option(build_option::force_implementations({{"conv_bsv16_fsv16", conv_impl}}));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory();
auto out_ptr = out_mem.pointer<float>();
auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory();
auto out_ptr_bfyx = out_mem_bfyx.pointer<float>();
ASSERT_EQ(out_mem.get_layout().format, format::bs_fs_yx_bsv16_fsv16);
auto flatten_ref = flatten_4d(format::bfyx, reference_result);
for (size_t i = 0; i < out_ptr_bfyx.size(); i++) {
auto equal = are_equal(flatten_ref[i] * scalar, out_ptr_bfyx[i], 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "Difference at idx = " << i << std::endl;
return;
}
}
}
INSTANTIATE_TEST_CASE_P(convolution_depthwise_gpu_fs_b_yx_fsv32,
convolution_depthwise_gpu,
::testing::Values(
// Input size, Filter size Y, Filter size X, groups, Stride, Output padding, With bias
// Stride testing
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 2, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 3, 0, false),
// Different Features testing
TestParamType_convolution_depthwise_gpu(5, 3, 3, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 20, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 25, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 33, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 35, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 45, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 65, 1, 0, false),
// Different filter's sizes testing
TestParamType_convolution_depthwise_gpu(5, 3, 2, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 1, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 2, 3, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 1, 3, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 2, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 1, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(5, 2, 3, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(5, 1, 3, 16, 2, 0, false),
// Input FeatureMap testing
TestParamType_convolution_depthwise_gpu(20, 3, 3, 50, 1, 0, false),
TestParamType_convolution_depthwise_gpu(30, 3, 3, 50, 1, 0, false),
TestParamType_convolution_depthwise_gpu(55, 3, 3, 50, 1, 0, false),
// Output padding testing + strides
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 1, 1, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 2, 2, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 3, 3, false)
),
convolution_depthwise_gpu::PrintToStringParamName);
TEST_P(convolution_depthwise_gpu, depthwise_conv_fs_b_yx_fsv32)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = 2;
const int input_xy = testing::get<0>(GetParam());
const int groups = testing::get<3>(GetParam());
const int input_f = groups;
const int output_f = groups;
const int filter_y = testing::get<1>(GetParam());
const int filter_x = testing::get<2>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const int input_offset_y = -(filter_y / 2);
const int input_offset_x = -(filter_x / 2);
const int output_y = 1 + (input_xy + 2 * (-input_offset_y) - filter_y) / stride + 2 * output_padding;
const int output_x = 1 + (input_xy + 2 * (-input_offset_x) - filter_x) / stride + 2 * output_padding;
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
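    // Depthwise weights in goiyx layout: 'groups' groups, each holding a single
    // one-input-channel filter of size filter_y x filter_x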
auto weights_size = tensor(group(groups), batch(1), feature(1), spatial(filter_x, filter_y));
auto weights_data = generate_random_4d<FLOAT16>(output_f, 1, filter_y, filter_x, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::goiyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights_fsv", weights_mem));
// Reorder input to fs_byx_fsv32
topology.add(reorder("input_fsv", "input", { data_types::f16, format::fs_b_yx_fsv32, input_size }));
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi], // input, weights
stride, stride, // strides
0, // bias
1, 1, // dilation
-input_offset_y, -input_offset_x, // input padding
output_padding, output_padding, // output_padding
ofi, ofi + 1, // f_begin, f_end
true); // depthwise
}
}
auto conv_fsv = convolution("conv_fsv", "input_fsv", { "weights_fsv" }, groups,
{ 1, 1, stride, stride }, { 0, 0, input_offset_x, input_offset_y });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" };
options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_fsv").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::fs_b_yx_fsv32);
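    // fs_b_yx_fsv32 layout: features are packed in slices of 32, with the slice
    // index outermost, then batch, then y, x, and the feature offset within the slice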
for (int bi = 0; bi < batch_num; ++bi)
for (int fi = 0; fi < output_f; ++fi)
for (int yi = 0; yi < output_y; ++yi)
for (int xi = 0; xi < output_x; ++xi)
{
auto val_ref = reference_result[bi][fi][yi][xi];
auto val = out_ptr[(fi / 32) * batch_num * output_y * output_x * 32 +
bi * output_y * output_x * 32 +
yi * output_x * 32 +
xi * 32 +
fi % 32];
auto equal = are_equal(val_ref, val, 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "At b = " << bi << ", fi = " << fi << ", yi = " << yi << ", xi = " << xi << std::endl;
}
}
}
INSTANTIATE_TEST_CASE_P(convolution_depthwise_gpu_b_fs_yx_fsv16,
convolution_depthwise_gpu_fsv16,
::testing::Values(
// Input size, Filter size Y, Filter size X, groups, Stride, Output padding, With bias
// Stride testing
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 2, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 3, 0, false),
// Different Features testing
TestParamType_convolution_depthwise_gpu(5, 3, 3, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 20, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 25, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 33, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 35, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 45, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 65, 1, 0, false),
// Different filter's sizes testing
TestParamType_convolution_depthwise_gpu(15, 1, 1, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 2, 2, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 3, 3, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 4, 4, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 5, 5, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(15, 8, 8, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(15, 1, 9, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(15, 17, 8, 16, 2, 0, false),
// Input FeatureMap testing
TestParamType_convolution_depthwise_gpu(20, 3, 3, 50, 1, 0, false),
TestParamType_convolution_depthwise_gpu(30, 3, 3, 50, 1, 0, false),
TestParamType_convolution_depthwise_gpu(55, 3, 3, 50, 1, 0, false),
// Output padding testing + strides
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 1, 1, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 2, 2, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 3, 3, false)
),
convolution_depthwise_gpu_fsv16::PrintToStringParamName);
TEST_P(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = 2;
const int input_xy = testing::get<0>(GetParam());
const int groups = testing::get<3>(GetParam());
const int input_f = groups;
const int output_f = groups;
const int filter_y = testing::get<1>(GetParam());
const int filter_x = testing::get<2>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const int input_offset_y = -(filter_y / 2);
const int input_offset_x = -(filter_x / 2);
const int f_group_size = 16;
const int f_group_num_in_batch = (output_f % f_group_size) ? (output_f / f_group_size + 1) : (output_f / f_group_size);
const int output_y = 1 + (input_xy + 2 * (-input_offset_y) - filter_y) / stride + 2 * output_padding;
const int output_x = 1 + (input_xy + 2 * (-input_offset_x) - filter_x) / stride + 2 * output_padding;
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(group(output_f), batch(1), feature(1), spatial(filter_x, filter_y));
auto weights_data = generate_random_4d<FLOAT16>(output_f, 1, filter_y, filter_x, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::goiyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights_fsv", weights_mem));
// Reorder input to b_fs_yx_fsv16
topology.add(reorder("input_fsv", "input", { data_types::f16, format::b_fs_yx_fsv16, input_size }));
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi], // input, weights
stride, stride, // strides
0, // bias
1, 1, // dilation
-input_offset_y, -input_offset_x, // input padding
output_padding, output_padding, // output_padding
ofi, ofi + 1, // f_begin, f_end
true); // depthwise
}
}
auto conv_fsv = convolution("conv_fsv", "input_fsv", { "weights_fsv" }, groups,
{ 1, 1, stride, stride }, { 0, 0, input_offset_x, input_offset_y });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv_fsv").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::b_fs_yx_fsv16);
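    // b_fs_yx_fsv16 layout: per batch, features are packed into f_group_num_in_batch
    // slices of 16, each slice stored as y, x, then the feature offset within the slice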
for (int bi = 0; bi < batch_num; ++bi)
for (int fi = 0; fi < output_f; ++fi)
for (int yi = 0; yi < output_y; ++yi)
for (int xi = 0; xi < output_x; ++xi)
{
auto val_ref = reference_result[bi][fi][yi][xi];
auto val = out_ptr[bi * f_group_num_in_batch * f_group_size * output_y * output_x +
(fi / f_group_size) * output_y * output_x * f_group_size +
yi * output_x * f_group_size +
xi * f_group_size +
fi % f_group_size];
auto equal = are_equal(val_ref, val, 1e-2f);
EXPECT_TRUE(equal);
if (!equal)
{
std::cout << "At b = " << bi << ", fi = " << fi << ", yi = " << yi << ", xi = " << xi << std::endl;
}
}
}
TEST(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16_in_feature_padding) {
// Input: 1x32x2x1
// Input padding above: 0x16x0x0
// Input padding below: 0x64x0x0
// Groups: 32
// Filter: 32x1x1x1x1
// Output: 1x32x2x1
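    // The reorder below adds asymmetric feature padding (lower 16, upper 64) to the
    // fsv16 buffer, so the convolution must read its features at the padded offset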
auto num_groups = 32;
auto input_size = tensor{ 1, num_groups, 1, 2 };
auto weights_size = tensor(group(num_groups), batch(1), feature(1), spatial(1, 1));
auto bias_size = tensor{ 1, num_groups, 1, 1 };
auto stride = tensor{ 1, 1, 1, 1 };
auto input_offset = tensor{ 0, 0, 0, 0 };
auto dilation = tensor{ 1, 1, 1, 1 };
auto output_size = tensor{ 1, num_groups, 1, 2};
auto input_lower_sizes = { 0, 16, 0, 0 };
auto input_upper_sizes = { 0, 64, 0, 0 };
const auto& engine = get_test_engine();
auto input = memory::allocate(engine, { data_types::f32, format::bfyx, input_size });
auto weights = memory::allocate(engine, { data_types::f32, format::goiyx, weights_size});
auto bias = memory::allocate(engine, { data_types::f32, format::bfyx, bias_size});
set_values<float>(input, {
3, -1, -1, -1, 2, -2, 2, 2, 0, 1, -5, 4, -1, 4, 1, 0,
-1, 4, -4, 3, -4, 4, 0, 1, -4, 3, 1, 0, -3, -1, 3, 0,
4, -2, 0, -2, -1, -4, 1, -4, 3, 2, 2, 2, 3, 4, 0, -5,
3, -5, 2, -3, 0, 3, -3, -4, -1, -4, 2, 4, 3, 4, -5, -5
});
set_values<float>(weights, {
-4, -1, -4, -2, 1, 2, 3, -4, -5, -2, -2, 1, 2, 2, -5, 2,
-5, 4, -3, 2, -1, -3, 0, -1, -5, -3, -5, -4, 1, -4, 4, 2
});
set_values<float>(bias, {
-2, 3, -4, -4, -4, 4, -3, -1, -3, -5, 3, 3, 1, 4, -4, 1,
-1, -2, -2, -1, 2, -1, -4, 2, -4, -1, 3, 0, 0, 4, -3, -4
});
std::vector<float> output_vec = {
-14, 2, 4, 4, -12, 4, -8, -8, -4, -3, -6, 12, -6, 9, -5, -1,
2, -23, 3, -11, 11, -5, 3, 4, -7, 7, 6, 4, 11, 1, 7, 1,
-21, 9, -2, -10, 1, 10, 1, -9, -1, 0, -7, -7, -4, -4, 2, 7,
-19, 21, -7, 8, 3, -12, 12, 16, -1, -4, -4, -12, 9, 13, -14, -14
};
// reorder input to fsv16 format and introduce feature padding
padding input_padding = padding(input_lower_sizes, input_upper_sizes);
layout reordered_input_layout = layout(data_types::f32, format::b_fs_yx_fsv16, input_size, input_padding);
topology topology(
input_layout("input", input.get_layout()),
reorder("input_reordered", "input", reordered_input_layout),
data("weights", weights),
data("bias", bias),
convolution("conv", "input_reordered", { "weights" }, { "bias" }, num_groups, stride, input_offset, dilation, output_size, data_types::f32),
reorder("out", "conv", format::bfyx, data_types::f32));
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));
network network(engine, topology, options);
network.set_input_data("input", input);
auto outputs = network.execute();
EXPECT_EQ(outputs.size(), size_t(1));
EXPECT_EQ(outputs.begin()->first, "out");
auto output_memory = outputs.at("out").get_memory();
auto output_layout = output_memory.get_layout();
auto output_ptr = output_memory.pointer<float>();
int y_size = output_layout.size.spatial[1];
int x_size = output_layout.size.spatial[0];
int f_size = output_layout.size.feature[0];
int b_size = output_layout.size.batch[0];
EXPECT_EQ(output_layout.format, format::bfyx);
EXPECT_EQ(y_size, output_size.spatial[1]);
EXPECT_EQ(x_size, output_size.spatial[0]);
EXPECT_EQ(f_size, output_size.feature[0]);
EXPECT_EQ(b_size, output_size.batch[0]);
for (int b = 0; b < b_size; ++b) {
for (int f = 0; f < f_size; ++f) {
for (int y = 0; y < y_size; ++y) {
for (int x = 0; x < x_size; ++x) {
EXPECT_EQ(
output_vec[b * f_size * y_size * x_size + f * y_size * x_size + y * x_size + x],
output_ptr[b * f_size * y_size * x_size + f * y_size * x_size + y * x_size + x]
);
}
}
}
}
}
struct convolution_depthwise_gpu_bfyx : public convolution_depthwise_gpu {};
TEST_P(convolution_depthwise_gpu_bfyx, depthwise_conv_bfyx)
{
const auto& engine = get_test_engine();
if (!engine.get_info().supports_fp16)
{
std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
EXPECT_EQ(1, 1);
return;
}
const int batch_num = 2;
const int input_xy = testing::get<0>(GetParam());
const int groups = testing::get<3>(GetParam());
const int input_f = groups;
const int output_f = groups;
const int filter_y = testing::get<1>(GetParam());
const int filter_x = testing::get<2>(GetParam());
const int stride = testing::get<4>(GetParam());
const int output_padding = testing::get<5>(GetParam());
const int input_offset_y = -(filter_y / 2);
const int input_offset_x = -(filter_x / 2);
const int f_group_size = 1;
const int f_group_num_in_batch = (output_f % f_group_size) ? (output_f / f_group_size + 1) : (output_f / f_group_size);
const int output_y = 1 + (input_xy + 2 * (-input_offset_y) - filter_y) / stride + 2 * output_padding;
const int output_x = 1 + (input_xy + 2 * (-input_offset_x) - filter_x) / stride + 2 * output_padding;
auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
set_values(input_mem, input_data_bfyx);
auto weights_size = tensor(group(output_f), batch(1), feature(1), spatial(filter_x, filter_y));
auto weights_data = generate_random_4d<FLOAT16>(output_f, 1, filter_y, filter_x, -1, 1);
auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
auto weights_mem = memory::allocate(engine, { data_types::f16, format::goiyx, weights_size });
set_values(weights_mem, weights_data_bfyx);
// Will be used to store reference values calculated in branches depending on bias
auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
topology topology(
input_layout("input", input_mem.get_layout()),
data("weights", weights_mem));
// Calculate reference values without bias
for (auto bi = 0; bi < batch_num; ++bi)
{
for (auto ofi = 0; ofi < output_f; ++ofi)
{
reference_result[bi][ofi] = reference_convolve(
input_data[bi], weights_data[ofi], // input, weights
stride, stride, // strides
0, // bias
1, 1, // dilation
-input_offset_y, -input_offset_x, // input padding
output_padding, output_padding, // output_padding
ofi, ofi + 1, // f_begin, f_end
true); // depthwise
}
}
auto conv_fsv = convolution("conv", "input", { "weights" }, groups,
{ 1, 1, stride, stride }, { 0, 0, input_offset_x, input_offset_y });
conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
topology.add(conv_fsv);
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = { format::bfyx, "" };
options.set_option(build_option::force_implementations({ {"conv", conv_impl} }));
network network(engine, topology, options);
network.set_input_data("input", input_mem);
network.execute();
auto out_mem = network.get_output("conv").get_memory();
auto out_ptr = out_mem.pointer<FLOAT16>();
ASSERT_EQ(out_mem.get_layout().format, format::bfyx);
for (int bi = 0; bi < batch_num; ++bi)
for (int fi = 0; fi < output_f; ++fi)
for (int yi = 0; yi < output_y; ++yi)
for (int xi = 0; xi < output_x; ++xi)
{
auto val_ref = reference_result[bi][fi][yi][xi];
auto val = out_ptr[bi * f_group_num_in_batch * f_group_size * output_y * output_x +
(fi / f_group_size) * output_y * output_x * f_group_size +
yi * output_x * f_group_size +
xi * f_group_size +
fi % f_group_size];
auto equal = are_equal(val_ref, val, 1e-2f);
ASSERT_TRUE(equal) << "At b = " << bi << ", fi = " << fi << ", yi = " << yi << ", xi = " << xi << std::endl;
}
}
INSTANTIATE_TEST_CASE_P(convolution_depthwise_gpu_bfyx,
convolution_depthwise_gpu_bfyx,
::testing::Values(
// Input size, Filter size Y, Filter size X, groups, Stride, Output padding, With bias
// Stride testing
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 2, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 3, 0, false),
// Different Features testing
TestParamType_convolution_depthwise_gpu(5, 3, 3, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 20, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 25, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 33, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 35, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 45, 1, 0, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 65, 1, 0, false),
// Different filter's sizes testing
TestParamType_convolution_depthwise_gpu(15, 1, 1, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 2, 2, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 3, 3, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 4, 4, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 5, 5, 16, 1, 0, false),
TestParamType_convolution_depthwise_gpu(15, 5, 5, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(15, 8, 8, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(15, 1, 9, 16, 2, 0, false),
TestParamType_convolution_depthwise_gpu(15, 17, 8, 16, 2, 0, false),
// Input feature map (spatial) size testing
TestParamType_convolution_depthwise_gpu(20, 3, 3, 50, 1, 0, false),
TestParamType_convolution_depthwise_gpu(30, 3, 3, 50, 1, 0, false),
TestParamType_convolution_depthwise_gpu(55, 3, 3, 50, 1, 0, false),
// Output padding testing + strides
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 1, 1, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 2, 2, false),
TestParamType_convolution_depthwise_gpu(5, 3, 3, 32, 3, 3, false)
),
convolution_depthwise_gpu::PrintToStringParamName);
INSTANTIATE_TEST_CASE_P(convolution_grouped_fsv4_fsv16,
convolution_grouped_gpu,
::testing::Values(
// Input X size, Input Y size, Input features, Output features, Kernel size X, Kernel size Y,
// Groups number, Stride, Output padding, Batch, Input data format
// Format: b_fs_yx_fsv4
TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv4),
TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv4),
// Format: b_fs_yx_fsv16
TestParamType_grouped_convolution_gpu(4, 4, 16, 17, 3, 3, 1, 1, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1, format::b_fs_yx_fsv16),
TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1, format::b_fs_yx_fsv16)
),
convolution_grouped_gpu::PrintToStringParamName);
TEST_P(convolution_grouped_gpu, base) {
const auto& engine = get_test_engine();
const int input_x = testing::get<0>(GetParam()),
input_y = testing::get<1>(GetParam()),
input_f = testing::get<2>(GetParam()),
output_f = testing::get<3>(GetParam()),
filter_x = testing::get<4>(GetParam()),
filter_y = testing::get<5>(GetParam()),
groups = testing::get<6>(GetParam()),
stride = testing::get<7>(GetParam()),
batch_num = testing::get<8>(GetParam()),
output_padding = 0,
input_offset_y = (filter_y - 1) / 2,
input_offset_x = (filter_x - 1) / 2;
auto input_data_format = testing::get<9>(GetParam());
auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y));
auto input_rnd = generate_random_4d<uint8_t>(batch_num, input_f, input_y, input_x, 0, 255);
auto input_rnd_vec = flatten_4d<uint8_t>(format::bfyx, input_rnd);
auto input = memory::allocate(engine, {data_types::u8, format::bfyx, input_size});
set_values(input, input_rnd_vec);
auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y));
VVVVVF<int8_t> weights_rnd = generate_random_5d<int8_t>(groups, output_f / groups, input_f / groups, filter_y, filter_x, -127, 127);
auto weights_lay = layout(data_types::i8, format::goiyx, weights_size);
std::vector<int8_t> weights_flat(weights_lay.get_linear_size());
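// Pack the 5D random weights into goiyx order by querying the layout for each
// element's linear offset, so the flat buffer matches the memory's physical layout.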
for (int gi = 0; gi < groups; ++gi)
for (int ofi = 0; ofi < output_f / groups; ++ofi)
for (int ifi = 0; ifi < input_f / groups; ++ifi)
for (int kyi = 0; kyi < filter_y; ++kyi)
for (int kxi = 0; kxi < filter_x; ++kxi) {
tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, 0, 0));
size_t offset = weights_lay.get_linear_offset(coords);
weights_flat[offset] = weights_rnd[gi][ofi][ifi][kyi][kxi];
}
auto weights = memory::allocate(engine, {data_types::i8, format::goiyx, weights_size});
set_values(weights, weights_flat);
VVVVF<float> expected_result(batch_num, VVVF<float>(output_f));
// Calculate reference values without bias
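// Each output feature of group gi only sees the input feature slice
// [gi * input_f / groups, (gi + 1) * input_f / groups); f_begin/f_end below
// hand exactly that slice to reference_convolve.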
for (int bi = 0; bi < batch_num; ++bi)
for (int gi = 0; gi < groups; ++gi)
for (int ofi = 0; ofi < (int)weights_rnd[0].size(); ++ofi) {
bool grouped = groups > 1;
int f_begin = gi * input_f / groups;
int f_end = gi * input_f / groups + input_f / groups;
expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
input_rnd[bi], weights_rnd[gi][ofi], // input, weights
stride, stride, // strides
0, // bias
1, 1, // dilation
input_offset_y, input_offset_x, // input padding
0, 0, // output_padding
f_begin, f_end, // f_begin, f_end
false, // depthwise
grouped); // grouped
}
topology topology(input_layout("input", input.get_layout()),
data("weights", weights),
reorder("input_fsv", "input", {data_types::u8, input_data_format, input_size}),
convolution("conv",
"input_fsv",
{"weights"},
groups,
{1, 1, stride, stride},
{0, 0, -input_offset_x, -input_offset_y},
{1, 1, 1, 1},
padding({0, 0, output_padding, output_padding}, 0.f)));
build_options options;
options.set_option(build_option::optimize_data(true));
implementation_desc conv_impl = {input_data_format, "fused_conv_eltwise_gpu_imad"};
options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
network network(engine, topology, options);
network.set_input_data("input", input);
network.execute();
auto out_mem = network.get_output("conv").get_memory();
auto out_ptr = out_mem.pointer<float>();
auto out_lay = out_mem.get_layout();
ASSERT_EQ(out_mem.get_layout().format, input_data_format);
ASSERT_EQ(out_lay.size.batch[0], expected_result.size());
ASSERT_EQ(out_lay.size.feature[0], expected_result[0].size());
ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0].size());
ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0].size());
for (int bi = 0; bi < batch_num; ++bi)
for (int ofi = 0; ofi < output_f; ++ofi)
for (int yi = 0; yi < (int)expected_result[0][0].size(); ++yi)
for (int xi = 0; xi < (int)expected_result[0][0][0].size(); ++xi) {
tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, 0, 0));
auto offset = out_lay.get_linear_offset(coords);
auto val = out_ptr[offset];
auto val_ref = expected_result[bi][ofi][yi][xi];
auto equal = are_equal(val_ref, val, 1e-2f);
if (!equal) {
std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
}
EXPECT_TRUE(equal);
}
}
template <typename InputT, typename WeightsT, typename OutputT>
class convolution_test_base {
public:
virtual topology build_topology(const cldnn::engine& engine) {
auto input_lay = layout(input_type(), format::bfyx, input_size());
auto wei_lay = layout(weights_type(), format::bfyx, weights_size());
auto wei_mem = memory::allocate(engine, wei_lay);
auto weights_flat = flatten_4d(format::bfyx, _weights);
set_values(wei_mem, weights_flat);
layout reordered_layout = layout{input_type(), input_format(), input_size(), padding_size()};
auto topo = topology();
topo.add(input_layout("input", input_lay));
topo.add(reorder("input_reorder", "input", reordered_layout));
std::string input_id = "input_reorder";
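// Asymmetric quantization is modeled explicitly: when zero points are set, the
// topology subtracts them with eltwise(sub) nodes, so the network effectively
// computes conv(input - input_zp, weights - weights_zp), mirroring the
// data_zp/weights_zp arguments of reference_convolve.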
if (has_input_zp()) {
auto input_zp_lay = layout(input_type(), format::bfyx, tensor(feature(input_features())));
auto input_zp_mem = memory::allocate(engine, input_zp_lay);
set_values(input_zp_mem, _input_zp);
topo.add(data("input_zp", input_zp_mem));
topo.add(eltwise("input_asymm", { "input_reorder", "input_zp" }, eltwise_mode::sub));
input_id = "input_asymm";
}
topo.add(data("weights", wei_mem));
std::string weights_id = "weights";
if (has_weights_zp()) {
auto weights_zp_lay = layout(weights_type(), format::bfyx, tensor(batch(output_features())));
auto weights_zp_mem = memory::allocate(engine, weights_zp_lay);
set_values(weights_zp_mem, _weights_zp);
topo.add(data("weights_zp", weights_zp_mem));
topo.add(eltwise("weights_asymm", { "weights", "weights_zp" }, eltwise_mode::sub));
weights_id = "weights_asymm";
}
if (!has_bias()) {
auto conv_prim = convolution(
"conv",
input_id,
{ weights_id },
static_cast<uint32_t>(groups()),
tensor(batch(0), feature(0), spatial(_stride_x, _stride_y)),
tensor(batch(0), feature(0), spatial(_offset_x, _offset_y)),
tensor(batch(0), feature(0), spatial(_dilation_x, _dilation_y)));
conv_prim.output_data_type = output_type();
topo.add(conv_prim);
} else {
auto bias_lay = layout(output_type(), format::bfyx, tensor(feature(output_features())));
auto bias_mem = memory::allocate(engine, bias_lay);
set_values(bias_mem, _bias);
topo.add(data("bias", bias_mem));
auto conv_prim = convolution(
"conv",
input_id,
{ weights_id },
{ "bias" },
static_cast<uint32_t>(groups()),
tensor(batch(0), feature(0), spatial(_stride_x, _stride_y)),
tensor(batch(0), feature(0), spatial(_offset_x, _offset_y)),
tensor(batch(0), feature(0), spatial(_dilation_x, _dilation_y)));
conv_prim.output_data_type = output_type();
topo.add(conv_prim);
}
return topo;
}
virtual primitive_id output_primitive_id() const {
return "conv";
}
virtual void run_expect(const VVVVF<OutputT>& expected) {
auto engine = get_test_engine();
auto topo = build_topology(engine);
auto build_opts = build_options(
build_option::optimize_data(true),
build_option::force_implementations({ {"conv", {input_format(), ""}} })
);
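// An empty kernel name in implementation_desc forces only the format and
// leaves the kernel choice to the engine.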
auto prog = program(engine, topo, build_opts);
auto net = network(prog, 0);
auto input_lay = layout(input_type(), format::bfyx, input_size());
auto input_mem = memory::allocate(engine, input_lay);
std::vector<InputT> input_flat(input_lay.get_linear_size(), static_cast<InputT>(0));
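// Fill the flat input buffer element-by-element via the layout's linear
// offsets instead of flatten_4d; the same loop then works for any layout.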
for (size_t bi = 0; bi < batch_num(); ++bi)
for (size_t fi = 0; fi < input_features(); ++fi)
for (size_t yi = 0; yi < input_y(); ++yi)
for (size_t xi = 0; xi < input_x(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
size_t offset = input_lay.get_linear_offset(coords);
input_flat[offset] = _input[bi][fi][yi][xi];
}
set_values(input_mem, input_flat);
net.set_input_data("input", input_mem);
auto result = net.execute();
auto out_mem = result.at(output_primitive_id()).get_memory();
auto out_lay = out_mem.get_layout();
auto out_ptr = out_mem.cldnn::memory::template pointer<OutputT>();
std::stringstream description;
for (auto i : net.get_primitives_info()) {
if (i.original_id == "conv") {
std::cout << i.kernel_id << std::endl;
description << " kernel: " << i.kernel_id << std::endl;
}
}
description << " executed: ";
for (auto e : net.get_executed_primitive_ids()) {
description << e << ", ";
}
ASSERT_EQ(out_lay.data_type, output_type());
ASSERT_EQ(out_lay.size.batch[0], expected.size());
ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
for (size_t bi = 0; bi < batch_num(); ++bi)
for (size_t fi = 0; fi < output_features(); ++fi)
for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
size_t offset = out_lay.get_linear_offset(coords);
ASSERT_EQ(out_ptr[offset], expected[bi][fi][yi][xi])
<< "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi << std::endl
<< description.str();
}
}
void set_input(format::type fmt, VVVVF<InputT> input) {
_input_fmt = fmt;
_input = std::move(input);
}
void set_weights(VVVVF<WeightsT> weights) {
_weights = std::move(weights);
}
void set_bias(VF<OutputT> bias) {
_bias = std::move(bias);
}
void set_strides(int stride_x, int stride_y) {
_stride_x = stride_x;
_stride_y = stride_y;
}
void set_offsets(int offset_x, int offset_y) {
_offset_x = offset_x;
_offset_y = offset_y;
}
void set_dilation(int dilation_x, int dilation_y) {
_dilation_x = dilation_x;
_dilation_y = dilation_y;
}
void set_input_zp(VF<InputT> input_zp) {
_input_zp = std::move(input_zp);
}
void set_weights_zp(VF<WeightsT> weights_zp) {
_weights_zp = std::move(weights_zp);
}
void set_padded_input(bool padded_input) {
_padded_input = padded_input;
}
protected:
VVVVF<InputT> _input;
VVVVF<WeightsT> _weights;
VF<OutputT> _bias;
VF<InputT> _input_zp;
VF<WeightsT> _weights_zp;
format::type _input_fmt;
int _stride_x, _stride_y;
int _offset_x, _offset_y;
int _dilation_x, _dilation_y;
bool _padded_input;
size_t batch_num() const { return _input.size(); }
size_t input_features() const { return _input[0].size(); }
size_t input_x() const { return _input[0][0][0].size(); }
size_t input_y() const { return _input[0][0].size(); }
size_t output_features() const { return _weights.size(); }
size_t weights_input_features() const { return _weights[0].size(); }
size_t filter_x() const { return _weights[0][0][0].size(); }
size_t filter_y() const { return _weights[0][0].size(); }
size_t groups() const { return input_features() / weights_input_features(); }
bool has_bias() { return _bias.size() > 0; }
bool has_input_zp() { return _input_zp.size() > 0; }
bool has_weights_zp() { return _weights_zp.size() > 0; }
bool need_padded_input() { return _padded_input; }
data_types input_type() const { return type_to_data_type<InputT>::value; }
format input_format() const { return _input_fmt; }
tensor input_size() const {
return tensor(TensorValue(batch_num()),
TensorValue(input_features()),
TensorValue(input_x()),
TensorValue(input_y()));
}
data_types weights_type() const { return type_to_data_type<WeightsT>::value; }
tensor weights_size() const {
return tensor(TensorValue(output_features()),
TensorValue(weights_input_features()),
TensorValue(filter_x()),
TensorValue(filter_y()));
}
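// Optional symmetric input padding: reserve a border of filter/2 on each
// spatial side (the usual "same"-padding border for odd filter sizes).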
padding padding_size() const {
if (_padded_input) {
return padding{
tensor(0, 0, TensorValue(filter_x() / 2), TensorValue(filter_y() / 2)).sizes(),
tensor(0, 0, TensorValue(filter_x() / 2), TensorValue(filter_y() / 2)).sizes()};
} else {
return padding{};
}
}
data_types output_type() const { return type_to_data_type<OutputT>::value; }
};
struct convolution_random_test_all_params {
static constexpr size_t spatials = 2;
size_t batch;
size_t input_features;
size_t output_features;
std::array<size_t, spatials> input_xy;
std::array<size_t, spatials> filter_xy;
std::array<int, spatials> stride_xy;
std::array<int, spatials> offset_xy;
std::array<int, spatials> dilation_xy;
bool with_bias;
size_t groups;
format::type input_format;
bool asymmetric_weights;
bool asymmetric_data;
bool need_padded_input;
};
template <typename InputT, typename WeightsT, typename OutputT>
class convolution_random_test_base : public convolution_test_base<InputT, WeightsT, OutputT> {
public:
virtual VVVVF<OutputT> calculate_reference() {
VVVVF<OutputT> expected = VVVVF<OutputT>(this->batch_num(), VVVF<OutputT>(this->output_features()));
bool depthwise = this->groups() == this->input_features();
bool grouped = this->groups() > 1 && !depthwise;
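// For depthwise convolution every output feature reads exactly one input
// feature, so the reference is convolved over the slice [fi, fi + 1);
// otherwise all weights_input_features() inputs are used starting from 0.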
for (size_t bi = 0; bi < this->batch_num(); ++bi)
for (size_t fi = 0; fi < this->output_features(); ++fi) {
size_t f_begin = depthwise ? fi : 0;
size_t f_end = (depthwise ? fi : 0) + this->weights_input_features();
auto bias = this->has_bias() ? this->_bias[fi] : static_cast<OutputT>(0);
auto weights_zp = this->has_weights_zp() ? this->_weights_zp[fi] : static_cast<WeightsT>(0);
expected[bi][fi] = reference_convolve<InputT, OutputT, WeightsT>(
this->_input[bi],
this->_weights[fi],
this->_stride_y,
this->_stride_x,
static_cast<float>(bias),
this->_dilation_y,
this->_dilation_x,
-this->_offset_y,
-this->_offset_x,
0,
0,
f_begin,
f_end,
depthwise,
grouped,
this->_input_zp,
weights_zp);
}
return expected;
}
virtual void param_set_up(const convolution_random_test_all_params& params) {
auto wei_in_f = params.input_features / params.groups;
auto input_data = generate_random_4d<InputT>(
params.batch, params.input_features, params.input_xy[1], params.input_xy[0], -256, 256);
auto weights_data = generate_random_4d<WeightsT>(
params.output_features, wei_in_f, params.filter_xy[1], params.filter_xy[0], -256, 256);
auto bias_data = params.with_bias ? generate_random_1d<OutputT>(params.output_features, -256, 256) : VF<OutputT>();
auto weights_zp_data = params.asymmetric_weights ? generate_random_1d<WeightsT>(params.output_features, -256, 256) : VF<WeightsT>();
auto input_zp_data = params.asymmetric_data ? generate_random_1d<InputT>(params.input_features, -256, 256) : VF<InputT>();
this->set_input(params.input_format, std::move(input_data));
this->set_weights(std::move(weights_data));
this->set_bias(std::move(bias_data));
this->set_strides(params.stride_xy[0], params.stride_xy[1]);
this->set_offsets(params.offset_xy[0], params.offset_xy[1]);
this->set_dilation(params.dilation_xy[0], params.dilation_xy[1]);
this->set_weights_zp(std::move(weights_zp_data));
this->set_input_zp(std::move(input_zp_data));
this->set_padded_input(params.need_padded_input);
}
void run_random(const convolution_random_test_all_params& params) {
param_set_up(params);
VVVVF<OutputT> expected = calculate_reference();
ASSERT_NO_FATAL_FAILURE(this->run_expect(expected));
}
};
// Construct a readable test name in the following format:
// <out format>_i<input>_w<weights>_s<stride>_ofs<offset>_d<dilation>_g<groups>_<bias>
static std::string to_string_convolution_all_params(const testing::TestParamInfo<convolution_random_test_all_params>& param_info) {
auto& params = param_info.param;
int Batch = (int)params.batch;
int iF = (int)params.input_features;
int oF = (int)params.output_features;
auto& iSize = params.input_xy;
auto& fSize = params.filter_xy;
auto& Stride = params.stride_xy;
auto& Offset = params.offset_xy;
auto& Dilation = params.dilation_xy;
auto groups = params.groups;
bool Bias = params.with_bias;
format::type iType = params.input_format; // input format
bool asymm_weights = params.asymmetric_weights;
bool asymm_input = params.asymmetric_data;
bool padded_input = params.need_padded_input;
// Wrapper for negative values, e.g. "-1", which would generate an invalid gtest param string
auto to_string_neg = [](int val) {
if (val >= 0)
return std::to_string(val);
else
return "m" + std::to_string(-val);
};
return fmt_to_str(iType) +
"_i" + std::to_string(Batch) + 'x' + std::to_string(iF) + 'x' + std::to_string(iSize[0]) + 'x' + std::to_string(iSize[1]) +
"_w" + std::to_string(oF) + 'x' + std::to_string(iF) + 'x' + std::to_string(fSize[0]) + 'x' + std::to_string(fSize[1]) +
"_s" + std::to_string(Stride[0]) + 'x' + std::to_string(Stride[1]) +
"_ofs" + to_string_neg(Offset[0]) + 'x' + to_string_neg(Offset[1]) +
"_d" + std::to_string(Dilation[0]) + 'x' + std::to_string(Dilation[1]) +
"_g" + std::to_string(groups) +
(Bias ? "_bias" : "") + (asymm_weights ? "_wzp" : "") + (asymm_input ? "_izp" : "") +
(padded_input ? "_in_pad" : "");
}
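// For example, the first smoke-test case below (batch 1, 3 -> 32 features,
// 28x28 input, 7x7 filter, stride 2, offset -3, with bias) in b_fs_yx_fsv4
// would print roughly as:
//   b_fs_yx_fsv4_i1x3x28x28_w32x3x7x7_s2x2_ofsm3xm3_d1x1_g1_bias
// (the exact prefix depends on fmt_to_str).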
template <typename InputT, typename WeightsT, typename OutputT>
class convolution_random_test_fsv4_input : public convolution_random_test_base<InputT, WeightsT, OutputT> {
public:
using parent = convolution_random_test_base<InputT, WeightsT, OutputT>;
topology build_topology(const cldnn::engine& engine) override {
auto input_lay = layout(this->input_type(), format::b_fs_yx_fsv4, this->input_size());
auto wei_lay = layout(this->weights_type(), format::bfyx, this->weights_size());
auto wei_mem = memory::allocate(engine, wei_lay);
auto wei_flat = flatten_4d(format::bfyx, this->_weights);
set_values(wei_mem, wei_flat);
layout reordered_layout = layout{this->input_type(), this->input_format(), this->input_size(), this->padding_size()};
auto topo = topology();
topo.add(input_layout("input", input_lay));
topo.add(reorder("input_reorder", "input", reordered_layout));
std::string input_id = "input_reorder";
if (this->has_input_zp()) {
auto input_zp_lay = layout(this->input_type(), format::bfyx, tensor(feature(this->input_features())));
auto input_zp_mem = memory::allocate(engine, input_zp_lay);
set_values(input_zp_mem, this->_input_zp);
topo.add(data("input_zp", input_zp_mem));
topo.add(eltwise("input_asymm", { "input_reorder", "input_zp" }, eltwise_mode::sub));
input_id = "input_asymm";
}
topo.add(data("weights", wei_mem));
std::string weights_id = "weights";
if (this->has_weights_zp()) {
auto weights_zp_lay = layout(this->weights_type(), format::bfyx, tensor(batch(this->output_features())));
auto weights_zp_mem = memory::allocate(engine, weights_zp_lay);
set_values(weights_zp_mem, this->_weights_zp);
topo.add(data("weights_zp", weights_zp_mem));
topo.add(eltwise("weights_asymm", { "weights", "weights_zp" }, eltwise_mode::sub));
weights_id = "weights_asymm";
}
if (!this->has_bias()) {
auto conv_prim = convolution(
"conv",
input_id,
{ weights_id },
static_cast<uint32_t>(this->groups()),
tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
conv_prim.output_data_type = this->output_type();
topo.add(conv_prim);
} else {
auto bias_lay = layout(this->output_type(), format::bfyx, tensor(feature(this->output_features())));
auto bias_mem = memory::allocate(engine, bias_lay);
set_values(bias_mem, this->_bias);
topo.add(data("bias", bias_mem));
auto conv_prim = convolution(
"conv",
input_id,
{ weights_id },
{ "bias" },
static_cast<uint32_t>(this->groups()),
tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
conv_prim.output_data_type = this->output_type();
topo.add(conv_prim);
}
return topo;
}
void run_expect(const VVVVF<OutputT>& expected) override {
auto engine = get_test_engine();
auto topo = this->build_topology(engine);
auto build_opts = build_options(
build_option::optimize_data(true),
build_option::force_implementations({ {"conv", { this->input_format(), ""}} })
);
auto prog = program(engine, topo, build_opts);
auto net = network(prog, 0);
auto input_lay = layout(this->input_type(), format::b_fs_yx_fsv4, this->input_size());
auto input_mem = memory::allocate(engine, input_lay);
std::vector<InputT> input_flat(input_lay.get_linear_size(), static_cast<InputT>(0));
for (size_t bi = 0; bi < this->batch_num(); ++bi)
for (size_t fi = 0; fi < this->input_features(); ++fi)
for (size_t yi = 0; yi < this->input_y(); ++yi)
for (size_t xi = 0; xi < this->input_x(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
size_t offset = input_lay.get_linear_offset(coords);
input_flat[offset] = this->_input[bi][fi][yi][xi];
}
set_values(input_mem, input_flat);
net.set_input_data("input", input_mem);
auto result = net.execute();
auto out_mem = result.at(this->output_primitive_id()).get_memory();
auto out_lay = out_mem.get_layout();
auto out_ptr = out_mem.cldnn::memory::template pointer<OutputT>();
std::stringstream description;
for (auto i : net.get_primitives_info()) {
if (i.original_id == "conv") {
std::cout << i.kernel_id << std::endl;
description << " kernel: " << i.kernel_id << std::endl;
}
}
description << " executed: ";
for (auto e : net.get_executed_primitive_ids()) {
description << e << ", ";
}
ASSERT_EQ(out_lay.data_type, this->output_type());
ASSERT_EQ(out_lay.size.batch[0], expected.size());
ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
for (size_t bi = 0; bi < this->batch_num(); ++bi)
for (size_t fi = 0; fi < this->output_features(); ++fi)
for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
for (size_t xi = 0; xi < expected[0][0][0].size(); ++xi) {
tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
size_t offset = out_lay.get_linear_offset(coords);
ASSERT_EQ(out_ptr[offset], expected[bi][fi][yi][xi])
<< "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi << std::endl
<< description.str();
}
}
};
template <typename InputT, typename WeightsT, typename OutputT>
class convolution_scale_random_test : public convolution_random_test_base<InputT, WeightsT, OutputT> {
public:
using parent = convolution_random_test_base<InputT, WeightsT, OutputT>;
primitive_id output_primitive_id() const override {
return "scale_wa_reorder";
}
topology build_topology(const cldnn::engine& engine) override {
topology topo = parent::build_topology(engine);
auto scale_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->output_features())));
auto shift_lay = layout(this->output_type(), format::bfyx, tensor(batch(1), feature(this->output_features())));
auto scale_mem = memory::allocate(engine, scale_lay);
auto shift_mem = memory::allocate(engine, shift_lay);
set_values(scale_mem, _scale);
set_values(shift_mem, _shift);
topo.add(cldnn::data("scale_scale", scale_mem));
topo.add(cldnn::data("scale_shift", shift_mem));
topo.add(cldnn::scale("scale", "conv", "scale_scale", "scale_shift"));
// Work-around: if the scale primitive is the network output, it will not be fused
topo.add(cldnn::reorder("scale_wa_reorder", "scale", format::bfyx, this->output_type()));
return topo;
}
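// The reference applies the same per-feature affine transform as the scale
// primitive: each expected value becomes value * _scale[fi] + _shift[fi]
// (via reference_scale_post_op).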
VVVVF<OutputT> calculate_reference() override {
auto expected = parent::calculate_reference();
for (size_t bi = 0; bi < this->batch_num(); ++bi)
for (size_t fi = 0; fi < this->output_features(); ++fi) {
expected[bi][fi] = reference_scale_post_op<OutputT>(expected[bi][fi], _scale[fi], _shift[fi]);
}
return expected;
}
void param_set_up(const convolution_random_test_all_params& params) override {
parent::param_set_up(params);
_scale = generate_random_1d<OutputT>(this->output_features(), -1, 1);
_shift = generate_random_1d<OutputT>(this->output_features(), 128, 128);  // the (128, 128) range yields a constant shift of 128
}
protected:
VF<OutputT> _scale;
VF<OutputT> _shift;
};
class convolution_random_smoke_test : public testing::TestWithParam<convolution_random_test_all_params> {};
using convolution_random_test_s8s8f32 = convolution_random_test_base<int8_t, int8_t, float>;
using convolution_random_test_u8s8f32 = convolution_random_test_base<uint8_t, int8_t, float>;
using convolution_random_test_fsv4_input_s8s8f32 = convolution_random_test_fsv4_input<int8_t, int8_t, float>;
using convolution_random_test_fsv4_input_u8s8f32 = convolution_random_test_fsv4_input<uint8_t, int8_t, float>;
using convolution_scale_random_test_s8s8f32 = convolution_scale_random_test<int8_t, int8_t, float>;
using convolution_scale_random_test_u8s8f32 = convolution_scale_random_test<uint8_t, int8_t, float>;
struct params_generator : std::vector<convolution_random_test_all_params> {
params_generator& smoke_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
std::vector<size_t> batches = { 1, 2 };
for (auto b : batches) {
// first conv
push_back(convolution_random_test_all_params{
b, 3, 32, { 28, 28 }, { 7, 7 }, { 2, 2 }, { -3, -3 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 3, 64, { 1024, 10 }, { 5, 5 }, { 2, 2 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 3, 15, { 10, 10 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 4, 18, { 10, 10 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 3x3
push_back(convolution_random_test_all_params{
b, 32, 48, { 14, 14 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 32, 48, { 14, 14 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 1x1
push_back(convolution_random_test_all_params{
b, 32, 48, { 28, 28 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 32, 48, { 28, 28 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 5x5
push_back(convolution_random_test_all_params{
b, 32, 48, { 28, 28 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 32, 48, { 28, 28 }, { 5, 5 }, { 2, 2 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// depthwise
push_back(convolution_random_test_all_params{
b, 64, 64, { 19, 19 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 64, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 64, 64, { 19, 19 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 1, 1 }, true, 64, input_format, asymm_weights, asymm_data, padded_input });
// dilation
push_back(convolution_random_test_all_params{
b, 32, 24, { 19, 19 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 2, 2 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 32, 24, { 19, 19 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 2, 2 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// depthwise + dilation
push_back(convolution_random_test_all_params{
b, 64, 64, { 19, 19 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 2, 2 }, true, 64, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 64, 64, { 19, 19 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 2, 2 }, true, 64, input_format, asymm_weights, asymm_data, padded_input });
}
return *this;
}
params_generator& extra_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
std::vector<size_t> batches = { 1, 2 };
for (auto b : batches) {
// 1x1
push_back(convolution_random_test_all_params{
b, 23, 41, { 19, 19 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 23, 41, { 19, 19 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 3x3
push_back(convolution_random_test_all_params{
b, 16, 28, { 14, 14 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 23, 41, { 19, 17 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 5x5
push_back(convolution_random_test_all_params{
b, 16, 28, { 14, 14 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
b, 23, 41, { 19, 17 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
}
return *this;
}
params_generator& bs_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
std::vector<int> strides = { 1, 2 };
for (auto s : strides) {
// 1x1
push_back(convolution_random_test_all_params{
// batch, in_f, out_f, input{x,y}, filter{x,y}, stride{x,y}, offset{x,y}, dilation{x,y}, bias, groups
16, 32, 32, { 4, 4 }, { 1, 1 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
16, 32, 32, { 9, 9 }, { 1, 1 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
// 3x3
push_back(convolution_random_test_all_params{
16, 32, 32, { 4, 4 }, { 3, 3 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
push_back(convolution_random_test_all_params{
16, 32, 32, { 9, 9 }, { 3, 3 }, { s, s }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data, padded_input });
}
return *this;
}
params_generator& all_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false, bool padded_input = false) {
return smoke_test_params(input_format, asymm_weights, asymm_data, padded_input)
.extra_test_params(input_format, asymm_weights, asymm_data, padded_input);
}
params_generator& add(convolution_random_test_all_params params) {
push_back(params);
return *this;
}
};
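// params_generator chains fluently; e.g.
//   params_generator().smoke_test_params(format::bfyx).extra_test_params(format::bfyx)
// yields the concatenation of both parameter sets, as used in the
// INSTANTIATE_TEST_CASE_P calls below.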
TEST_P(convolution_random_smoke_test, u8s8f32) {
convolution_random_test_u8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
TEST_P(convolution_random_smoke_test, u8s8f32_scale) {
convolution_scale_random_test_u8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
TEST_P(convolution_random_smoke_test, s8s8f32_fsv4_input) {
convolution_random_test_fsv4_input_s8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
TEST_P(convolution_random_smoke_test, u8s8f32_fsv4_input) {
convolution_random_test_fsv4_input_u8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
INSTANTIATE_TEST_CASE_P(
basic,
convolution_random_smoke_test,
testing::ValuesIn(
params_generator()
.smoke_test_params(format::b_fs_yx_fsv4)
.smoke_test_params(format::bfyx)
.smoke_test_params(format::b_fs_yx_fsv32)
.smoke_test_params(format::b_fs_yx_fsv32, true, true)
.smoke_test_params(format::b_fs_yx_fsv32, false, true)
.smoke_test_params(format::b_fs_yx_fsv32, true, false)
.smoke_test_params(format::b_fs_yx_fsv32, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16, false, false, true)
.smoke_test_params(format::b_fs_yx_fsv16)
.bs_test_params(format::bs_fs_yx_bsv16_fsv16)
),
to_string_convolution_all_params
);
class convolution_random_all_test : public testing::TestWithParam<convolution_random_test_all_params> {};
TEST_P(convolution_random_all_test, u8s8f32) {
convolution_random_test_u8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
TEST_P(convolution_random_all_test, s8s8f32) {
convolution_random_test_s8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
TEST_P(convolution_random_all_test, u8s8f32_scale) {
convolution_scale_random_test_u8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
TEST_P(convolution_random_all_test, s8s8f32_scale) {
convolution_scale_random_test_s8s8f32 test;
ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
}
INSTANTIATE_TEST_CASE_P(
DISABLED_basic,
convolution_random_all_test,
testing::ValuesIn(
params_generator()
.all_test_params(format::bfyx)
.all_test_params(format::bfyx, true, true)
.all_test_params(format::bfyx, false, true)
.all_test_params(format::bfyx, true, false)
.all_test_params(format::b_fs_yx_fsv4)
// byxf_af32 - depthwise broken for batch > 1
// .smoke_test_params(format::byxf_af32)
.all_test_params(format::b_fs_yx_fsv32)
.all_test_params(format::b_fs_yx_fsv32, true, true)
.all_test_params(format::b_fs_yx_fsv32, false, true)
.all_test_params(format::b_fs_yx_fsv32, true, false)
.all_test_params(format::b_fs_yx_fsv16)
.add(convolution_random_test_all_params{
1, 89, 3, { 1, 1 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, format::b_fs_yx_fsv4, false, false, false })
.add(convolution_random_test_all_params{
1, 16, 32, { 3, 3 }, { 17, 17 }, { 1, 1 }, { -8, -8 }, { 1, 1 }, true, 1, format::b_fs_yx_fsv16, false, false, true })
),
to_string_convolution_all_params
);
class convolution_test : public tests::generic_test
{
public:
static void TearDownTestCase()
{
all_generic_params.clear();
all_layer_params.clear();
all_test_params.clear();
}
static std::vector<std::shared_ptr<cldnn::primitive>> generate_specific_test_params()
{
// TODO: check split
// TODO: check convolution without bias
const std::vector<primitive_id>& weights = { "input1" };
const std::vector<primitive_id>& bias = { "input2" };
std::vector<tensor> stride_sizes = { tensor(1, 1, 1, 1), tensor(1, 1, 2, 3), tensor(1, 1, 4, 1), tensor(1, 1, 5, 5) };
std::vector<tensor> dilation_sizes = { tensor(1, 1, 1, 1), tensor(1, 1, 5, 4), tensor(1, 1, 1, 3), tensor(1, 1, 7, 2) };
std::vector<tensor> input_offset_sizes = { tensor(0, 0, 0, 0), tensor(0, 0, 2, 2), tensor(0, 0, -5, -2), tensor(0, 0, 3, -3) };
// No padding
all_layer_params.emplace_back(new convolution("convolution_no_relu", "input0", weights, bias, stride_sizes[0], input_offset_sizes[0], dilation_sizes[0]));
all_layer_params.emplace_back(new convolution("convolution_no_relu", "input0", weights, bias, stride_sizes[1], input_offset_sizes[1], dilation_sizes[1]));
all_layer_params.emplace_back(new convolution("convolution_no_relu", "input0", weights, bias, stride_sizes[2], input_offset_sizes[2], dilation_sizes[2]));
all_layer_params.emplace_back(new convolution("convolution_no_relu", "input0", weights, bias, stride_sizes[3], input_offset_sizes[3], dilation_sizes[3]));
// Input padding
all_layer_params.emplace_back(new convolution("convolution_no_relu", "reorder0", weights, bias, stride_sizes[1], input_offset_sizes[1], dilation_sizes[1]));
all_layer_params.emplace_back(new convolution("convolution_no_relu", "reorder0", weights, bias, stride_sizes[3], input_offset_sizes[3], dilation_sizes[3]));
// Output padding
all_layer_params.emplace_back(new convolution("convolution_no_relu", "input0", weights, bias, stride_sizes[1], input_offset_sizes[1], dilation_sizes[1], { { 0, 0, 2, 4 },{ 0, 0, 0, 19 } }));
all_layer_params.emplace_back(new convolution("convolution_no_relu", "input0", weights, bias, stride_sizes[2], input_offset_sizes[2], dilation_sizes[2], { { 0, 0, 1, 0 },{ 0, 0, 13, 9 } }));
// Input + Output padding
all_layer_params.emplace_back(new convolution("convolution_no_relu", "reorder0", weights, bias, stride_sizes[0], input_offset_sizes[0], dilation_sizes[0], { { 0, 0, 1, 5 },{ 0, 0, 19, 4 } }));
all_layer_params.emplace_back(new convolution("convolution_no_relu", "reorder0", weights, bias, stride_sizes[3], input_offset_sizes[3], dilation_sizes[3], { { 0, 0, 1, 2 },{ 0, 0, 3, 4 } }));
return all_layer_params;
}
static std::vector<std::tuple<std::shared_ptr<tests::test_params>, std::shared_ptr<cldnn::primitive>>> generate_all_test_params()
{
generate_specific_test_params();
std::vector<cldnn::format> input_formats = { cldnn::format::bfyx, cldnn::format::yxfb };
std::vector<cldnn::format> weights_formats = { cldnn::format::bfyx, cldnn::format::yxfb };
std::vector<int32_t> output_features_sizes = { 1, 3, 16 };
std::vector<cldnn::tensor> kernel_sizes = { tensor(1, 1, 1, 1), tensor(1, 1, 4, 7), tensor(1, 1, 5, 3) };
std::vector<tensor> input_tensor_size = { tensor(1, 5, 59, 72), tensor(8, 3, 63, 56), tensor(16, 2, 50, 50), tensor(32, 1, 44, 62) };
auto data_types = test_data_types();
for (cldnn::data_types data_type : data_types)
{
for (cldnn::format input_format : input_formats)
{
for (cldnn::format weights_format : weights_formats)
{
cldnn::build_options network_build_options;
if (input_format == cldnn::format::bfyx)
{
network_build_options.set_option(cldnn::build_option::optimize_data(true));
}
for (cldnn::tensor input_size : input_tensor_size)
{
for (cldnn::tensor kernel_size : kernel_sizes)
{
for (auto output_features : output_features_sizes)
{
std::shared_ptr<tests::test_params> params = std::make_shared<test_params>(data_type, input_format, input_size.batch[0], input_size.feature[0], tensor(1, 1, input_size.spatial[0], input_size.spatial[1]), network_build_options);
int input_features = params->input_layouts[0].size.feature[0];
params->input_layouts.push_back(cldnn::layout(params->data_type, weights_format, cldnn::tensor(output_features, input_features, kernel_size.spatial[0], kernel_size.spatial[1]))); // weights
params->input_layouts.push_back(cldnn::layout(params->data_type, params->fmt, cldnn::tensor(1, 1, output_features, 1))); // biases
all_generic_params.push_back(params);
}
}
}
}
}
}
// Create all the combinations for the test.
for (const auto& layer_param : all_layer_params)
{
for (auto test_param : all_generic_params)
{
all_test_params.push_back(std::make_tuple(test_param, layer_param));
}
}
return all_test_params;
}
virtual bool is_format_supported(cldnn::format format)
{
return ((format == cldnn::format::bfyx) || (format == cldnn::format::yxfb));
}
virtual cldnn::tensor get_expected_output_tensor()
{
auto convolution = std::static_pointer_cast<const cldnn::convolution>(layer_params);
tensor input_size = generic_params->input_layouts[0].size;
tensor dilation = convolution->dilation;
tensor stride = convolution->stride;
tensor input_offset = convolution->input_offset;
tensor weights_size = generic_params->input_layouts[1].size;
int kernel_extent_y = dilation.spatial[1] * (weights_size.spatial[1] - 1) + 1;
int kernel_extent_x = dilation.spatial[0] * (weights_size.spatial[0] - 1) + 1;
// Calculate output size
int output_size_y = 1 + (input_size.spatial[1] - kernel_extent_y - 2 * input_offset.spatial[1]) / stride.spatial[1];
int output_size_x = 1 + (input_size.spatial[0] - kernel_extent_x - 2 * input_offset.spatial[0]) / stride.spatial[0];
int output_features = weights_size.batch[0];
return cldnn::tensor(input_size.batch[0], output_features, output_size_x, output_size_y);
}
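// Worked example (hypothetical values): a 56-wide input, 5x5 filter,
// dilation 1 (kernel extent 5), input_offset -2 and stride 2 give
// 1 + (56 - 5 - 2 * (-2)) / 2 = 28 outputs along that axis.
static_assert(1 + (56 - 5 - 2 * (-2)) / 2 == 28, "output size example");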
virtual void prepare_input_for_test(std::vector<cldnn::memory>& inputs)
{
if (generic_params->data_type == data_types::f32)
{
prepare_input_for_test_typed<float>(inputs);
}
else
{
prepare_input_for_test_typed<FLOAT16>(inputs);
}
}
template<typename Type>
void prepare_input_for_test_typed(std::vector<cldnn::memory>& inputs)
{
int k = (generic_params->data_type == data_types::f32) ? 8 : 4;
// Update inputs.
auto input = inputs[0];
auto input_size = inputs[0].get_layout().size;
VVVVF<Type> input_rnd = generate_random_4d<Type>(input_size.batch[0], input_size.feature[0], input_size.spatial[1], input_size.spatial[0], -2, 2, k);
VF<Type> input_rnd_vec = flatten_4d<Type>(input.get_layout().format, input_rnd);
set_values(input, input_rnd_vec);
// Update weights.
auto weight_input = inputs[1];
auto weight_size = inputs[1].get_layout().size;
VVVVF<Type> weight_rnd = generate_random_4d<Type>(weight_size.batch[0], weight_size.feature[0], weight_size.spatial[1], weight_size.spatial[0], -2, 2, k);
VF<Type> weight_rnd_vec = flatten_4d<Type>(weight_input.get_layout().format, weight_rnd);
set_values(weight_input, weight_rnd_vec);
// Update biases.
auto bias_input = inputs[2];
auto bias_size = inputs[2].get_layout().size;
VF<Type> bias_rnd = generate_random_1d<Type>(bias_size.spatial[0], -2, 2, k);
set_values(bias_input, bias_rnd);
}
template<typename Type>
memory generate_reference_typed(const std::vector<cldnn::memory>& inputs)
{
// Output reference is always bfyx.
auto convolution = std::static_pointer_cast<const cldnn::convolution>(layer_params);
data_types dt = inputs[0].get_layout().data_type;
tensor input_size = inputs[0].get_layout().size;
tensor dilation = convolution->dilation;
tensor stride = convolution->stride;
tensor input_offset = convolution->input_offset;
tensor weights_size = inputs[1].get_layout().size;
padding output_padding = convolution->output_padding;
tensor output_size = get_expected_output_tensor();
// Calculate output size
int output_size_y = output_size.spatial[1];
int output_size_x = output_size.spatial[0];
int output_features = weights_size.batch[0];
int input_features = weights_size.feature[0];
auto output = memory::allocate( engine, cldnn::layout(dt, cldnn::format::bfyx, output_size, output_padding) );
auto input_mem = inputs[0].pointer<Type>();
auto weights_mem = inputs[1].pointer<Type>();
auto bias_mem = inputs[2].pointer<Type>();
auto output_mem = output.pointer<Type>();
tensor output_buffer_size = output.get_layout().get_buffer_size();
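// The output buffer size includes output padding, so every linear index
// computed below is shifted by the lower padding before writing.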
// Initialize the output with zeros.
std::fill(output_mem.begin(), output_mem.end(), static_cast<Type>(0));
// Add the bias
for (int b = 0; b < input_size.batch[0]; b++)
{
for (int out_f = 0; out_f < output_features; out_f++)
{
for (int y = 0; y < output_size_y; y++)
{
for (int x = 0; x < output_size_x; x++)
{
int output_index = (b * output_buffer_size.feature[0] + out_f) * output_buffer_size.spatial[1] * output_buffer_size.spatial[0];
tensor lower_output_padding = convolution->output_padding.lower_size();
output_index += (lower_output_padding.spatial[1] + y) * output_buffer_size.spatial[0] + lower_output_padding.spatial[0] + x;
output_mem[output_index] += bias_mem[out_f];
}
}
}
}
const auto input0_desc = get_linear_memory_desc(inputs[0].get_layout());
const auto input1_desc = get_linear_memory_desc(inputs[1].get_layout());
// Convolve with weights
for (int b = 0; b < input_size.batch[0]; b++)
{
int input_bi = b;
for (int out_f = 0; out_f < output_features; out_f++)
{
for (int in_f = 0; in_f < input_features; in_f++)
{
int input_fi = in_f;
for (int y = 0; y < output_size_y; y++)
{
for (int x = 0; x < output_size_x; x++)
{
int output_bi = b;
int output_fi = out_f;
int output_yi = y;
int output_xi = x;
int output_index = (output_bi * output_buffer_size.feature[0] + output_fi) * output_buffer_size.spatial[1] * output_buffer_size.spatial[0];
tensor lower_output_padding = convolution->output_padding.lower_size();
output_index += (lower_output_padding.spatial[1] + output_yi) * output_buffer_size.spatial[0] + lower_output_padding.spatial[0] + output_xi;
for (int kernel_y = 0; kernel_y < weights_size.spatial[1]; kernel_y++)
{
int input_yi = y * stride.spatial[1] + input_offset.spatial[1] + kernel_y * dilation.spatial[1];
if ((input_yi < 0) || (input_yi >= input_size.spatial[1]))
{
continue;
}
for (int kernel_x = 0; kernel_x < weights_size.spatial[0]; kernel_x++)
{
int input_xi = x * stride.spatial[0] + input_offset.spatial[0] + kernel_x * dilation.spatial[0];
if ((input_xi < 0) || (input_xi >= input_size.spatial[0]))
{
continue;
}
size_t input_index = get_linear_index(inputs[0].get_layout(), input_bi, input_fi, input_yi, input_xi, input0_desc);
int weight_bi = out_f;
int weight_fi = in_f;
int weight_yi = kernel_y;
int weight_xi = kernel_x;
size_t weight_index = get_linear_index(inputs[1].get_layout(), weight_bi, weight_fi, weight_yi, weight_xi, input1_desc);
output_mem[output_index] += input_mem[input_index] * weights_mem[weight_index];
}
}
}
}
}
}
}
return output;
}
virtual memory generate_reference(const std::vector<cldnn::memory>& inputs)
{
if (generic_params->data_type == data_types::f32)
{
return generate_reference_typed<float>(inputs);
}
else
{
return generate_reference_typed<FLOAT16>(inputs);
}
}
private:
static std::vector<std::shared_ptr<tests::test_params>> all_generic_params;
static std::vector<std::shared_ptr<cldnn::primitive>> all_layer_params;
static std::vector<std::tuple<std::shared_ptr<tests::test_params>, std::shared_ptr<cldnn::primitive>>> all_test_params;
};
std::vector<std::shared_ptr<tests::test_params>> convolution_test::all_generic_params = {};
std::vector<std::shared_ptr<cldnn::primitive>> convolution_test::all_layer_params = {};
std::vector<std::tuple<std::shared_ptr<tests::test_params>, std::shared_ptr<cldnn::primitive>>> convolution_test::all_test_params = {};
TEST_P(convolution_test, CONVOLUTION)
{
run_single_test();
}
INSTANTIATE_TEST_CASE_P(DISABLED_CONVOLUTION,
convolution_test,
::testing::ValuesIn(convolution_test::generate_all_test_params()),
tests::generic_test::custom_param_name_functor());