[GPU] MaxPool-8 (#9064)

2021-12-24 10:18:58 +02:00
parent da20993272
commit 31b6b034bc
22 changed files with 391 additions and 48 deletions
--- a/inference-engine/thirdparty/clDNN/api/intel_gpu/primitives/pooling.hpp
+++ b/inference-engine/thirdparty/clDNN/api/intel_gpu/primitives/pooling.hpp
@@ -164,9 +164,54 @@ struct pooling : public primitive_base<pooling> {
          size(0, 0, 0, 0),
          with_output_size(false) {}

+    /// @brief Constructs a max pooling primitive with the MaxPool opset8 features (dilation and indices output).
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param indices_output Id of the primitive that holds the indices output; it is also passed to the base as an input.
+    /// @param size Pooling kernel size.
+    /// @param stride Defines shift in input buffer between adjacent calculations of output values.
+    /// @param dilation Defines the step between input elements sampled by the pooling window (1 means no dilation).
+    /// @param pad Defines logical pad value added to input tensor.
+    /// @param pad_end Defines a shift, relative to the end of padding shape.
+    /// @param axis First input dimension used for the indices upper bound (product of dimensions from axis on); 0 adds no bound.
+    /// @param index_element_type Data type of the indices output (the OCL implementation accepts i32 and i64 only).
+    /// @param output_size User-defined output data size of the primitive (w/o padding).
+    pooling(const primitive_id& id,
+            const primitive_id& input,
+            const primitive_id& indices_output,
+            const tensor& size,
+            const tensor& stride,
+            const tensor& dilation,
+            const tensor& pad,
+            const tensor& pad_end,
+            int64_t axis,
+            data_types index_element_type,
+            tensor output_size,
+            const data_types output_data_type,  ///< Output data type, forwarded to primitive_base.
+            const primitive_id& ext_prim_id = "",  ///< External primitive id, forwarded to primitive_base.
+            const padding& output_padding = padding())  ///< Output padding, forwarded to primitive_base.
+            : primitive_base(id, {input, indices_output}, ext_prim_id, output_padding, optional_data_type{output_data_type}),
+              argmax(""),  // left empty: this constructor reports indices through indices_output
+              indices_output(indices_output),
+              mode(pooling_mode::max),  // this constructor is max pooling only
+              global_pooling(false),
+              pad(pad),
+              stride(stride),
+              dilation(dilation),
+              size(size),
+              with_output_size(true),  // output_size is always supplied by the caller here
+              output_size(output_size),
+              pad_end(pad_end),
+              axis(axis),
+              index_element_type(index_element_type),
+              maxPoolOpset8Features(true)  // marks the primitive for the opset8 handling in the OCL implementation
+              {}
+
    /// @brief Primitive id which contains indices of each max pooling region.
    /// Indices must be in flattened bfyx format with no padding. Needs to be fp32 data type.
    primitive_id argmax;
+    /// @brief Primitive id which contains indices output.
+    primitive_id indices_output;
    /// @brief Pooling mode.
    pooling_mode mode;
    /// @brief Global pooling (kernel size is equal to the spatial dimension of input tensor)
@@ -175,6 +220,8 @@ struct pooling : public primitive_base<pooling> {
    tensor pad;
    /// @brief Defines shift in input buffer between adjacent calculations of output values.
    tensor stride;
+    /// @brief Defines the step between input elements sampled by the pooling window (1 means no dilation).
+    tensor dilation;
    /// @brief Pooling kernel size.
    tensor size;
    /// @brief Indicates that the primitive has user-defined output size (non-zero value).
@@ -183,12 +230,20 @@ struct pooling : public primitive_base<pooling> {
    tensor output_size;
    /// @brief Defines a shift, relative to the end of padding shape.
    tensor pad_end;
+    /// @brief First dimension of input used to calculate the upper bound of the index output (read only when maxPoolOpset8Features is set).
+    int64_t axis{0};  // default keeps the pre-opset8 constructors from leaving the member indeterminate
+    /// @brief Data type of the index output (read only when maxPoolOpset8Features is set).
+    data_types index_element_type{data_types::i64};  // default for the same reason as axis
+    bool maxPoolOpset8Features{false};  ///< True only when built by the opset8 (dilation + indices output) constructor.

 protected:
    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
-        if (argmax.empty())
-            return {};
-        return {argmax};
+        std::vector<std::reference_wrapper<const primitive_id>> ret;  // stays empty when neither optional id is set
+        if (!argmax.empty())
+            ret.push_back(argmax);
+        if (!indices_output.empty())  // NOTE(review): the opset8 constructor also lists this id as an input - confirm the duplicate is intended
+            ret.push_back(indices_output);
+        return ret;  // order: argmax first, then indices_output
    }
 };
 /// @}
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp
@@ -60,8 +60,25 @@ JitConstants PoolingKernelBase::GetJitConstants(const pooling_params& pp, Poolin
        MakeJitConstant(toString(pp.divMode) + "_KERNEL_DIVIDER", 1),
    });

+    if (pp.maxPoolOpset8Features) {  // opset8 MaxPool only: dilation, optional indices bound, indices type
+        mem_consts.AddConstants({MakeJitConstant("DILATION", pp.poolDilation)});  // presumably expands to DILATION_SIZE_X/Y/Z read by the ref kernel - TODO confirm
+
+        if (pp.poolAxis != 0) {  // axis 0 emits no INDICES_UPPER_BOUND, so the kernel applies no modulo
+            size_t indices_upper_bound = 1;
+            const auto& dims = pp.inputs[0].GetDims();
+            for (auto d = dims.crbegin() + pp.poolAxis; d != dims.crend(); ++d) {  // NOTE(review): assumes 0 < poolAxis <= dims.size(); other values move the iterator out of range - confirm callers normalize the axis
+                indices_upper_bound *= d->v;
+            }
+            if (indices_upper_bound != 0 && indices_upper_bound != 1) {  // 0 would be a modulo by zero in the kernel, 1 would force every index to 0
+                mem_consts.AddConstants({MakeJitConstant("INDICES_UPPER_BOUND", indices_upper_bound)});
+            }
+        }
+
+        mem_consts.Merge(MakeTypeJitConstants(pp.poolIndexElementType, "SELECTED_INDICES"));  // presumably provides SELECTED_INDICES_TYPE / TO_SELECTED_INDICES_TYPE used by pooling_gpu_ref.cl - TODO confirm
+    }
+
    if (dispatchData.needsBoundary) {
-        mem_consts.AddConstant(MakeJitConstant("CHECK_BOUNDRY", 1));
+        mem_consts.AddConstant(MakeJitConstant("CHECK_BOUNDARY", 1));
    }

    if (EnableRound(pp)) {
@@ -78,6 +95,8 @@ bool PoolingKernelBase::NeedsBoundaryCheck(const pooling_params& pp) const {

    if (pp.poolPad.x != 0 || pp.poolPad.y != 0 || pp.poolPad.z != 0) {
        return true;
+    } else if (pp.poolDilation.x > 1 || pp.poolDilation.y > 1 || pp.poolDilation.z > 1) {
+        return true;
    } else if ((((input.X().v - pp.poolSize.x) / pp.poolStride.x) + 1) < output.X().v ||
               (((input.Y().v - pp.poolSize.y) / pp.poolStride.y) + 1) < output.Y().v ||
               (((input.Z().v - pp.poolSize.z) / pp.poolStride.z) + 1) < output.Z().v) {
@@ -181,9 +200,13 @@ KernelsData PoolingKernelBase::GetCommonKernelsData(const Params& params,
    auto& kernel = kd.kernels[0];
    FillCLKernelData(kernel, dispatchData, params.engineInfo, kernelName, jit, entry_point, DEFAULT, false, false, 1,
                     GetFusedPrimitiveInputsCount(params));
+    uint32_t param_idx = 1;
    if (orgParams.poolType == PoolType::MAX_WITH_ARGMAX)
-        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, param_idx++});

+    if (orgParams.maxPoolOpset8Features) {
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, param_idx++});
+    }

    return {kd};
 }
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.h
@@ -4,6 +4,7 @@

 #pragma once

+#include "common_types.h"
 #include "kernel_base_opencl.h"
 #include "kernel_selector_params.h"

@@ -21,6 +22,10 @@ struct pooling_params : public base_params {
    uSize poolSize;
    uSize poolStride;
    uSize poolPad;
+    bool maxPoolOpset8Features = false;
+    uSize poolDilation{1, 1, 1};
+    Datatype poolIndexElementType = Datatype::INT64;
+    int64_t poolAxis = 0;

    ParamsKey GetParamsKey() const override {
        ParamsKey k = base_params::GetParamsKey();
@@ -29,6 +34,11 @@ struct pooling_params : public base_params {
        k.EnablePoolRemainder(remainderAction);
        k.EnablePoolKernelDividerMode(divMode);

+        if (maxPoolOpset8Features) {
+            k.EnablePoolDilation();
+            k.EnablePoolIndicesOutput();
+        }
+
        return k;
    }
 };
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp
@@ -27,6 +27,8 @@ ParamsKey PoolingKernelGPURef::GetSupportedKey() const {
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
    k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
    k.EnableDifferentTypes();
+    k.EnablePoolDilation();
+    k.EnablePoolIndicesOutput();
    return k;
 }

--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl
@@ -51,7 +51,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(

    ACCUMULATOR_TYPE result[4] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };

-#ifdef CHECK_BOUNDRY
+#ifdef CHECK_BOUNDARY
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
    {
@@ -96,7 +96,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
    const uint num_elements = (hend - offset_y) * (wend - offset_x);
 #endif
-#else // !CHECK_BOUNDRY
+#else // !CHECK_BOUNDARY
    uint input_idx = GET_DATA_B_FS_YX_FSV4_INDEX(INPUT0, b, f, offset_y, offset_x);

    for(uint j = 0; j < POOL_SIZE_Y; j++)
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_zyx_fsv16_imad.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_zyx_fsv16_imad.cl
@@ -255,7 +255,7 @@ KERNEL(pooling_gpu_b_fs_zyx_fsv16)(
    ACCUMULATOR_TYPE result[FEATURE_SLICE_SIZE] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL,
                                                    INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };

-#ifdef CHECK_BOUNDRY
+#ifdef CHECK_BOUNDARY
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
        offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
@@ -341,7 +341,7 @@ KERNEL(pooling_gpu_b_fs_zyx_fsv16)(
    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
    const uint num_elements = (dend - offset_z) * (hend - offset_y) * (wend - offset_x);
 #endif
-#else // !CHECK_BOUNDRY
+#else // !CHECK_BOUNDARY
 #if INPUT0_DIMS == 4
    uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
 #else
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bs_fs_yx_bsv16_fsv16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bs_fs_yx_bsv16_fsv16.cl
@@ -48,7 +48,7 @@ KERNEL(pooling_gpu_bs_fs_yx_bsv16_fsv16)(const __global INPUT0_TYPE* input,
    const uint input_fs_pitch = input_y_pitch * (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y);
    int16 result = INIT_VAL;

-#ifdef CHECK_BOUNDRY
+#ifdef CHECK_BOUNDARY
    uint batch_and_feature_offset = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, 0, 0);
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X || offset_y + POOL_SIZE_Y < 0 ||
        offset_y >= INPUT0_SIZE_Y) {
@@ -88,7 +88,7 @@ KERNEL(pooling_gpu_bs_fs_yx_bsv16_fsv16)(const __global INPUT0_TYPE* input,
    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
    const uint num_elements = (hend - offset_y) * (wend - offset_x);
 #endif
-#else  // !CHECK_BOUNDRY
+#else  // !CHECK_BOUNDARY
    uint input_idx = GET_DATA_BS_FS_YX_BSV16_FSV16_INDEX(INPUT0, b, f, offset_y, offset_x);
    __attribute__((opencl_unroll_hint(POOL_SIZE_Y)))
    for (uint j = 0; j < POOL_SIZE_Y; j++) {
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl
@@ -53,7 +53,7 @@ KERNEL(pooling_gpu_byxf_opt)(
    const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
    const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;

-#ifdef CHECK_BOUNDRY
+#ifdef CHECK_BOUNDARY
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
    {
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl
@@ -84,7 +84,7 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
    const size_t fs_offset = fs * fs_pitch; // locate beginning of feature tile
    const size_t b_offset = b * b_pitch;   // locate beginning of batch

-#ifdef CHECK_BOUNDRY
+#ifdef CHECK_BOUNDARY
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
    {
@@ -121,7 +121,7 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
    const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
    const uint num_elements = (hend - offset_y) * (wend - offset_x);
 #endif
-#else // !CHECK_BOUNDRY
+#else // !CHECK_BOUNDARY
    for(uint in_dy = 0; in_dy < POOL_SIZE_Y; in_dy++)
    {
        const size_t input_offset_y = (offset_y + in_dy) * y_pitch;
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl
@@ -92,7 +92,7 @@ KERNEL(pooling_gpu_int8_ref)(

    ACCUMULATOR_TYPE result = INIT_VAL;

-#ifdef CHECK_BOUNDRY
+#ifdef CHECK_BOUNDARY
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
        offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
@@ -155,7 +155,7 @@ KERNEL(pooling_gpu_int8_ref)(

 #endif  // DYNAMIC_WITH_PADDING_KERNEL_DIVIDER

-#else  // CHECK_BOUNDRY
+#else  // CHECK_BOUNDARY

 #if OUTPUT_DIMS == 5
    for(uint l = 0; l < POOL_SIZE_Z; l++)
@@ -179,7 +179,7 @@ KERNEL(pooling_gpu_int8_ref)(
    const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z;
 #endif

-#endif // CHECK_BOUNDRY
+#endif // CHECK_BOUNDARY

 #if defined AVG_POOLING
 #if ENABLE_ROUND
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl
@@ -28,6 +28,9 @@ KERNEL(pooling_gpu)(
 #if MAX_WITH_ARGMAX_POOLING
 , __global float* arg_max
 #endif
+#ifdef SELECTED_INDICES_TYPE
+, __global SELECTED_INDICES_TYPE* indices
+#endif
 #if HAS_FUSED_OPS_DECLS
    , FUSED_OPS_DECLS
 #endif
@@ -91,11 +94,15 @@ KERNEL(pooling_gpu)(

    ACCUMULATOR_TYPE result = INIT_VAL;

+#ifdef SELECTED_INDICES_TYPE
+    uint result_idx = 0;
+#endif
+
 #if MAX_WITH_ARGMAX_POOLING
    uint arg_max_idx = 0;
 #endif

-#ifdef CHECK_BOUNDRY
+#ifdef CHECK_BOUNDARY
    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
        offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
@@ -107,6 +114,16 @@ KERNEL(pooling_gpu)(
    uint num_elementes = 0;
 #endif

+#ifndef DILATION_SIZE_X
+    #define DILATION_SIZE_X 1
+#endif
+#ifndef DILATION_SIZE_Y
+    #define DILATION_SIZE_Y 1
+#endif
+#ifndef DILATION_SIZE_Z
+    #define DILATION_SIZE_Z 1
+#endif
+
 #if OUTPUT_DIMS == 5
    const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0, 0);
 #else
@@ -116,20 +133,20 @@ KERNEL(pooling_gpu)(
 #if OUTPUT_DIMS == 5
    for(uint l = 0; l < POOL_SIZE_Z; l++)
    {
-        int input_offset_z = offset_z + l;
+        int input_offset_z = offset_z + (l * DILATION_SIZE_Z);
        bool zero_z = input_offset_z >= INPUT0_SIZE_Z || input_offset_z < 0;
        if (!zero_z)
        {
 #endif
            for(uint j = 0; j < POOL_SIZE_Y; j++)
            {
-                int input_offset_y = offset_y + j;
+                int input_offset_y = offset_y + (j * DILATION_SIZE_Y);
                bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
                if(!zero_y)
                {
                    for(uint i = 0; i < POOL_SIZE_X; i++)
                    {
-                        int input_offset_x = offset_x + i;
+                        int input_offset_x = offset_x + (i * DILATION_SIZE_X);
                        bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
                        if(!zero)
                        {
@@ -159,7 +176,17 @@ KERNEL(pooling_gpu)(
                                arg_max_idx = input_idx_bfyx_no_padding;
                            }
 #endif
-                            result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
+                            const ACCUMULATOR_TYPE casted_input = TO_ACCUMULATOR_TYPE(input[input_idx]);
+                            #ifdef SELECTED_INDICES_TYPE
+                                if (casted_input > result)
+                                {
+                                    result = casted_input;
+                                    result_idx = input_idx;
+                                }
+                            #else
+                                result = FUNC_CALL(apply_pooling)(result, casted_input);
+                            #endif
+

 #ifdef DYNAMIC_KERNEL_DIVIDER
                            num_elementes++;
@@ -185,7 +212,7 @@ KERNEL(pooling_gpu)(

 #endif  // DYNAMIC_WITH_PADDING_KERNEL_DIVIDER

-#else  // CHECK_BOUNDRY
+#else  // CHECK_BOUNDARY

 #if  OUTPUT_DIMS == 5  // 3D
    uint input_idx = INPUT0_GET_INDEX(b, f, offset_z, offset_y, offset_x);
@@ -227,7 +254,16 @@ KERNEL(pooling_gpu)(
                uint input_idx = INPUT0_GET_INDEX(b, f, offset_y + j, offset_x + i);
                result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
    #else
-                result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
+                #ifdef SELECTED_INDICES_TYPE
+                    const ACCUMULATOR_TYPE current_input_value = TO_ACCUMULATOR_TYPE(input[input_idx]);  // explicit type and cast, same as the boundary-checked path
+                    if (current_input_value > result)  // strict '>' keeps the index of the first maximum
+                    {
+                        result = current_input_value;
+                        result_idx = input_idx;
+                    }
+                #else
+                    result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
+                #endif
                input_idx += INPUT0_X_PITCH;
    #endif
 #endif
@@ -253,7 +289,7 @@ KERNEL(pooling_gpu)(
    const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z;
 #endif

-#endif // CHECK_BOUNDRY
+#endif // CHECK_BOUNDARY

 #if defined AVG_POOLING
    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
@@ -280,6 +316,13 @@ KERNEL(pooling_gpu)(
 #endif
    output[output_pos] = final_result;

+#ifdef SELECTED_INDICES_TYPE
+    #ifdef INDICES_UPPER_BOUND
+        result_idx %= INDICES_UPPER_BOUND;
+    #endif
+    indices[output_pos] = TO_SELECTED_INDICES_TYPE(result_idx);
+#endif
+
 #if MAX_WITH_ARGMAX_POOLING
    //INPUT1 macro stands for Argmax
    const uint arg_max_pos = GET_DATA_INDEX_5D(INPUT1, b, f, z, y, x);
@@ -288,3 +331,7 @@ KERNEL(pooling_gpu)(
 }

 #undef INIT_VAL
+
+#undef DILATION_SIZE_X
+#undef DILATION_SIZE_Y
+#undef DILATION_SIZE_Z
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
@@ -124,6 +124,8 @@ public:
                        uint32_t dynamicKenrelDivider : 1;
                        uint32_t dynamicKenrelDividerWithPadding : 1;
                        uint32_t position_sensitive : 1;
+                        uint32_t dilation : 1;
+                        uint32_t indices_output : 1;
                    } pooling;
                    struct conv_t {
                        uint32_t split : 1;
@@ -281,6 +283,8 @@ public:
    void EnablePoolKernelDividerMode(KernelDividerMode m);
    void EnablePoolType(PoolType t);
    void EnablePoolRemainder(PoolRemainder r);
+    void EnablePoolDilation() { key.restrict.val.dedicated.pooling.dilation = 1; }
+    void EnablePoolIndicesOutput() { key.restrict.val.dedicated.pooling.indices_output = 1; }
    void EnableQuantization(QuantizationType q);
    void EnablePositionSensitivePooling() { key.restrict.val.dedicated.pooling.position_sensitive = 1; }
    void EnableSplitSupport() { key.restrict.val.dedicated.conv.split = 1; }
--- a/inference-engine/thirdparty/clDNN/src/impls/ocl/pooling.cpp
+++ b/inference-engine/thirdparty/clDNN/src/impls/ocl/pooling.cpp
@@ -89,6 +89,24 @@ public:
            get_default_optional_params<kernel_selector::pooling_optional_params>(arg.get_program());

        const auto primitive = arg.get_primitive();
+
+        pool_params.maxPoolOpset8Features = primitive->maxPoolOpset8Features;
+        if (pool_params.maxPoolOpset8Features) {
+            switch (primitive->index_element_type) {
+                case cldnn::data_types::i32: {
+                    pool_params.poolIndexElementType = kernel_selector::Datatype::INT32;
+                    break;
+                }
+                case cldnn::data_types::i64: {
+                    pool_params.poolIndexElementType = kernel_selector::Datatype::INT64;
+                    break;
+                }
+                default:
+                    throw std::runtime_error{"Not supported index element type"};
+            }
+            pool_params.poolAxis = primitive->axis;
+        }
+
        const auto& stride = primitive->stride;
        const auto& pad = primitive->pad;
        const auto& input_sizes = arg.input().get_output_layout().size;
@@ -134,6 +152,9 @@ public:

        pp.poolStride = {(uint32_t)stride.spatial[0], (uint32_t)stride.spatial[1], (uint32_t)stride.spatial[2]};

+        const auto& dilation = primitive->dilation;
+        pp.poolDilation = {(uint32_t)dilation.spatial[0], (uint32_t)dilation.spatial[1], (uint32_t)dilation.spatial[2]};
+
        auto& kernel_selector = kernel_selector::pooling_kernel_selector::Instance();
        auto best_kernels = kernel_selector.GetBestKernels(pool_params, pool_optional_params);

--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp
@@ -221,6 +221,7 @@ REGISTER_FACTORY(v8, NV12toBGR);
 REGISTER_FACTORY(v8, I420toRGB);
 REGISTER_FACTORY(v8, I420toBGR);
 REGISTER_FACTORY(v8, RandomUniform)
+REGISTER_FACTORY(v8, MaxPool);

 // --------------------------- Supported internal ops --------------------------- //
 REGISTER_FACTORY(internal, NonMaxSuppressionIEInternal);
--- a/src/plugins/intel_gpu/src/plugin/ops/pooling.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/pooling.cpp
@@ -8,6 +8,7 @@
 #include "ngraph/op/max_pool.hpp"
 #include "ngraph/op/avg_pool.hpp"

+#include "intel_gpu/primitives/mutable_data.hpp"
 #include "intel_gpu/primitives/pooling.hpp"

 namespace ov {
@@ -17,6 +18,7 @@ namespace intel_gpu {
 struct PoolingParameters {
    cldnn::tensor kernel;
    cldnn::tensor stride;
+    cldnn::tensor dilation;
    cldnn::tensor pad_begin;
    cldnn::tensor pad_end;
 };
@@ -24,10 +26,15 @@ struct PoolingParameters {
 static PoolingParameters GetPoolingParameters(const ngraph::Shape& kernel,
                                              const ngraph::Strides& strides,
                                              const ngraph::Shape& pads_begin,
-                                              const ngraph::Shape& pads_end) {
+                                              const ngraph::Shape& pads_end,
+                                              const ngraph::Strides& dilations = {}) {
    cldnn::tensor k, s, pb, pe;
-    if (pads_begin.size() != strides.size() || pads_end.size() != strides.size() || kernel.size() != strides.size())
-        IE_THROW() << "Strides, KernelSizes and Pads are supposed to have the same elements count";
+    cldnn::tensor d{cldnn::batch(1), cldnn::feature(1), cldnn::spatial(1, 1, 1)};
+    const auto is_dilation_specified = !dilations.empty();
+
+    if (pads_begin.size() != strides.size() || pads_end.size() != strides.size() || kernel.size() != strides.size()
+        || (is_dilation_specified && dilations.size() != strides.size()))
+        IE_THROW() << "Strides, KernelSizes, Pads (and Dilations, if specified) are supposed to have the same elements count";

    std::vector<cldnn::tensor::value_type> pb_casted(pads_begin.begin(), pads_begin.end());
    std::vector<cldnn::tensor::value_type> pe_casted(pads_end.begin(), pads_end.end());
@@ -37,6 +44,9 @@ static PoolingParameters GetPoolingParameters(const ngraph::Shape& kernel,
            s = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(strides[2], strides[1], strides[0]));
            pb = cldnn::tensor(cldnn::batch(0), cldnn::feature(0), cldnn::spatial(pb_casted[2], pb_casted[1], pb_casted[0]));
            pe = cldnn::tensor(cldnn::batch(0), cldnn::feature(0), cldnn::spatial(pe_casted[2], pe_casted[1], pe_casted[0]));
+            if (is_dilation_specified) {
+                d = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(dilations[2], dilations[1], dilations[0]));
+            }
            break;
        }
        case 2: {
@@ -44,6 +54,9 @@ static PoolingParameters GetPoolingParameters(const ngraph::Shape& kernel,
            s = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(strides[1], strides[0], 1));
            pb = cldnn::tensor(cldnn::batch(0), cldnn::feature(0), cldnn::spatial(pb_casted[1], pb_casted[0], 0));
            pe = cldnn::tensor(cldnn::batch(0), cldnn::feature(0), cldnn::spatial(pe_casted[1], pe_casted[0], 0));
+            if (is_dilation_specified) {
+                d = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(dilations[1], dilations[0], 1));
+            }
            break;
        }
        case 1: {
@@ -51,12 +64,15 @@ static PoolingParameters GetPoolingParameters(const ngraph::Shape& kernel,
            s = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(strides[0], 1, 1));
            pb = cldnn::tensor(cldnn::batch(0), cldnn::feature(0), cldnn::spatial(pb_casted[0], 0, 0));
            pe = cldnn::tensor(cldnn::batch(0), cldnn::feature(0), cldnn::spatial(pe_casted[0], 0, 0));
+            if (is_dilation_specified) {
+                d = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(dilations[0], 1, 1));
+            }
            break;
        }
        default: IE_THROW() << "Unsupported pooling parameters size. Only 1d, 2d, and 3d cases are supported";
    }

-    return {k, s, pb, pe};
+    return {k, s, d, pb, pe};
 }

 static void CreateAvgPoolOp(Program& p, const std::shared_ptr<ngraph::op::v1::AvgPool>& op) {
@@ -99,7 +115,60 @@ static void CreateMaxPoolOp(Program& p, const std::shared_ptr<ngraph::op::v1::Ma
    p.AddPrimitiveToProfiler(op);
 }

+// Lowers opset8 MaxPool: a cldnn::pooling primitive produces the pooled values, and a pair of
+// mutable_data primitives sharing one buffer carries the indices output.
+static void CreateMaxPoolOp(Program& p, const std::shared_ptr<ngraph::op::v8::MaxPool>& op) {
+    p.ValidateInputs(op, {1});
+    if (op->get_output_size() != 2) {
+        IE_THROW() << "MaxPool opset 8 requires 2 outputs";
+    }
+    const auto input_ids = p.GetInputPrimitiveIDs(op);
+    const auto base_name = layer_type_name_ID(op);
+    const auto values_id = base_name + ".0";
+    const auto friendly_name = op->get_friendly_name();
+
+    // Buffer for output 1 (indices), shared by the "write" and "read" mutable_data primitives below.
+    const auto indices_precision = op->get_output_element_type(1);
+    const auto indices_shape = op->get_output_shape(1);
+    const cldnn::layout indices_layout(DataTypeFromPrecision(indices_precision),
+                                       DefaultFormatForDims(indices_shape.size()),
+                                       tensor_from_dims(indices_shape));
+    const auto indices_memory = p.GetEngine().allocate_memory(indices_layout);
+
+    // "Write" side: added before the pooling primitive, which references it as its indices output.
+    const cldnn::primitive_id indices_write_id = base_name + "_md_write";
+    const cldnn::mutable_data indices_write_prim(indices_write_id, indices_memory, friendly_name);
+    p.primitiveIDs[indices_write_id] = indices_write_id;
+    p.AddPrimitive(indices_write_prim);
+
+    const auto params = GetPoolingParameters(op->get_kernel(), op->get_strides(), op->get_pads_begin(), op->get_pads_end(), op->get_dilations());
+    auto pooling_prim = cldnn::pooling(values_id,
+                                       input_ids[0],
+                                       indices_write_id,
+                                       params.kernel,
+                                       params.stride,
+                                       params.dilation,
+                                       params.pad_begin,
+                                       params.pad_end,
+                                       op->get_axis(),
+                                       DataTypeFromPrecision(op->get_index_element_type()),
+                                       tensor_from_dims(op->get_output_shape(0)),
+                                       DataTypeFromPrecision(op->get_output_element_type(0)),
+                                       friendly_name);
+    p.AddPrimitive(pooling_prim);
+
+    // "Read" side: same buffer under the ".1" id, with the pooling primitive as its input.
+    const cldnn::primitive_id indices_read_id = base_name + ".1";
+    const cldnn::mutable_data indices_read_prim(indices_read_id, {values_id}, indices_memory, friendly_name);
+    p.primitiveIDs[indices_read_id] = indices_read_id;
+    p.AddPrimitive(indices_read_prim);
+
+    p.AddPrimitiveToProfiler(pooling_prim, op);
+}
+
+
 REGISTER_FACTORY_IMPL(v1, MaxPool);
+REGISTER_FACTORY_IMPL(v8, MaxPool);
 REGISTER_FACTORY_IMPL(v1, AvgPool);

 }  // namespace intel_gpu
--- a/src/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/pooling.cpp
+++ b/src/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/pooling.cpp
@@ -363,6 +363,8 @@ const auto maxPoolv8_ExplicitPad_FloorRounding_Params = ::testing::Combine(
        ::testing::ValuesIn(dilation),
        ::testing::ValuesIn(padBegins),
        ::testing::ValuesIn(padEnds),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::FLOOR),
        ::testing::Values(ngraph::op::PadType::EXPLICIT)
 );
@@ -386,6 +388,8 @@ const auto maxPoolv8_SameUpperPad_FloorRounding_Params = ::testing::Combine(
        ::testing::ValuesIn(dilation),
        ::testing::ValuesIn(padBegins),
        ::testing::ValuesIn(padEnds),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::FLOOR),
        ::testing::Values(ngraph::op::PadType::SAME_UPPER)
 );
@@ -409,6 +413,8 @@ const auto maxPoolv8_SameLowerPad_FloorRounding_Params = ::testing::Combine(
        ::testing::ValuesIn(dilation),
        ::testing::ValuesIn(padBegins),
        ::testing::ValuesIn(padEnds),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::FLOOR),
        ::testing::Values(ngraph::op::PadType::SAME_LOWER)
 );
@@ -432,6 +438,8 @@ const auto maxPoolv8_ExplicitPad_FloorRounding_5Dinput_Params = ::testing::Combi
        ::testing::Values(dilation3D[0]),
        ::testing::ValuesIn(padBegins3D),
        ::testing::ValuesIn(padEnds3D),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::FLOOR),
        ::testing::Values(ngraph::op::PadType::EXPLICIT)
 );
@@ -455,6 +463,8 @@ const auto maxPoolv8_SameUpperPad_FloorRounding_5Dinput_Params = ::testing::Comb
        ::testing::ValuesIn(dilation3D),
        ::testing::ValuesIn(padBegins3D),
        ::testing::ValuesIn(padEnds3D),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::FLOOR),
        ::testing::Values(ngraph::op::PadType::SAME_UPPER)
 );
@@ -478,6 +488,8 @@ const auto maxPoolv8_SameLowerPad_CeilRounding_5Dinput_Params = ::testing::Combi
        ::testing::ValuesIn(dilation3D),
        ::testing::ValuesIn(padBegins3D),
        ::testing::ValuesIn(padEnds3D),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::CEIL),
        ::testing::Values(ngraph::op::PadType::SAME_LOWER)
 );
@@ -501,6 +513,8 @@ const auto maxPoolv8_ExplicitPad_CeilRounding_Params = ::testing::Combine(
        ::testing::ValuesIn(dilation),
        ::testing::ValuesIn(padBegins),
        ::testing::ValuesIn(padEnds),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::CEIL),
        ::testing::Values(ngraph::op::PadType::EXPLICIT)
 );
@@ -549,6 +563,8 @@ const auto maxPoolv8_ValidPad_Params = ::testing::Combine(
        ::testing::ValuesIn(dilation),
        ::testing::Values(std::vector<size_t>({0, 0})),
        ::testing::Values(std::vector<size_t>({0, 0})),
+        ::testing::Values(ngraph::element::Type_t::i32),
+        ::testing::Values(0),
        ::testing::Values(ngraph::op::RoundingType::FLOOR),  // placeholder value - Rounding Type not applicable for Valid pad type
        ::testing::Values(ngraph::op::PadType::VALID)
 );
--- a/src/tests/functional/plugin/cpu/single_layer_tests/pooling.cpp
+++ b/src/tests/functional/plugin/cpu/single_layer_tests/pooling.cpp
@@ -144,7 +144,9 @@ public:
        std::vector<size_t> padBegin, padEnd;
        ngraph::op::PadType padType;
        ngraph::op::RoundingType roundingType;
-        std::tie(kernel, stride, dilation, padBegin, padEnd, roundingType, padType) = basicParamsSet;
+        ngraph::element::Type indexElementType;
+        int64_t axis;
+        std::tie(kernel, stride, dilation, padBegin, padEnd, indexElementType, axis, roundingType, padType) = basicParamsSet;

        std::ostringstream results;
        results << "IS=(";
@@ -181,7 +183,9 @@ protected:
        std::vector<size_t> padBegin, padEnd;
        ngraph::op::PadType padType;
        ngraph::op::RoundingType roundingType;
-        std::tie(kernel, stride, dilation, padBegin, padEnd, roundingType, padType) = basicParamsSet;
+        ngraph::element::Type indexElementType;
+        int64_t axis;
+        std::tie(kernel, stride, dilation, padBegin, padEnd, indexElementType, axis, roundingType, padType) = basicParamsSet;
        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
        if (selectedType.empty()) {
            selectedType = getPrimitiveType();
@@ -192,7 +196,8 @@ protected:

        auto params = ngraph::builder::makeDynamicParams(inPrc, inputDynamicShapes);
        std::shared_ptr<ngraph::Node> pooling = ngraph::builder::makeMaxPoolingV8(params[0], stride, dilation, padBegin, padEnd,
-                                                                                  kernel, roundingType, padType);
+                                                                                  kernel, roundingType, padType,
+                                                                                  indexElementType, axis);
        pooling->get_rt_info() = getCPUInfo();
        ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(pooling->output(0))};
        function = std::make_shared<ngraph::Function>(results, params, "MaxPooling");
@@ -375,15 +380,19 @@ const std::vector<LayerTestsDefinitions::poolSpecificParams> paramsMax4D = {

 const std::vector<LayerTestsDefinitions::maxPoolV8SpecificParams> paramsMaxV84D = {
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {2, 2}, {2, 2}, {1, 1}, {0, 0}, {0, 0},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::SAME_LOWER },
 };

 const std::vector<LayerTestsDefinitions::maxPoolV8SpecificParams> paramsMaxV84D_ref = {
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {2, 2}, {2, 2}, {2, 2}, {0, 0}, {0, 0},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::SAME_UPPER },
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {4, 2}, {2, 2}, {1, 2}, {0, 0}, {0, 0},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::EXPLICIT },
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {4, 2}, {2, 1}, {2, 2}, {0, 0}, {0, 0},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::EXPLICIT },
 };

@@ -467,15 +476,19 @@ const std::vector<LayerTestsDefinitions::poolSpecificParams> paramsMax5D = {

 const std::vector<LayerTestsDefinitions::maxPoolV8SpecificParams> paramsMaxV85D = {
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::SAME_LOWER },
 };

 const std::vector<LayerTestsDefinitions::maxPoolV8SpecificParams> paramsMaxV85D_ref = {
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {2, 2, 2}, {1, 1, 1}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::SAME_UPPER },
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {2, 2, 2}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::EXPLICIT },
        LayerTestsDefinitions::maxPoolV8SpecificParams{ {2, 3, 4}, {2, 2, 2}, {2, 1, 1}, {1, 1, 1}, {1, 2, 2},
+                                                        ngraph::element::Type_t::i32, 0,  // index element type, axis
                                                        ngraph::op::RoundingType::CEIL, ngraph::op::PadType::EXPLICIT },
 };

--- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp
@@ -9,6 +9,7 @@

 using namespace ngraph::helpers;
 using namespace LayerTestsDefinitions;
+using namespace ngraph::element;

 namespace {
 const std::vector<InferenceEngine::Precision> netPrecisions = {
@@ -20,14 +21,20 @@ const std::vector<std::vector<size_t >> kernels = {{3, 3},
                                                          {3, 5}};
 const std::vector<std::vector<size_t >> strides = {{1, 1},
                                                          {1, 2}};
+const std::vector<std::vector<size_t >> dilations = {{1, 1},  // dilation sets swept by the MaxPool-8 suites below
+                                                          {1, 2}};
 const std::vector<std::vector<size_t >> padBegins = {{0, 0},
                                                            {0, 2}};
 const std::vector<std::vector<size_t >> padEnds = {{0, 0},
                                                          {0, 2}};
 const std::vector<ngraph::op::RoundingType> roundingTypes = {ngraph::op::RoundingType::CEIL,
                                                             ngraph::op::RoundingType::FLOOR};
+const std::vector<ngraph::element::Type_t> indexElementTypes = {ngraph::element::Type_t::i32};  // index element types fed to the MaxPool-8 suites
+const std::vector<int64_t> axes = {0, 2};  // values for the MaxPool-8 'axis' parameter
+const std::vector<size_t > inputShapeSmall = {1, 3, 30, 30};  // input shape shared by the AvgPool and MaxPool-8 suites
+const std::vector<size_t > inputShapeLarge = {1, 3, 50, 50};  // input shape shared by the MaxPool and MAX/AVG valid-pad PoolingLayerTest suites

-////* ========== Max Polling ========== */
+////* ========== Max Pooling ========== */
 /* +========== Explicit Pad Floor Rounding ========== */
 const auto maxPool_ExplicitPad_FloorRounding_Params = ::testing::Combine(
        ::testing::Values(PoolingTypes::MAX),
@@ -40,7 +47,7 @@ const auto maxPool_ExplicitPad_FloorRounding_Params = ::testing::Combine(
        ::testing::Values(false)  // placeholder value - exclude pad not applicable for max pooling
 );

-INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_FloorRpunding, PoolingLayerTest,
+INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_FloorRounding, PoolingLayerTest,
                        ::testing::Combine(
                                maxPool_ExplicitPad_FloorRounding_Params,
                                ::testing::ValuesIn(netPrecisions),
@@ -48,7 +55,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_FloorRpunding, PoolingLayerTe
                                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                                ::testing::Values(InferenceEngine::Layout::ANY),
                                ::testing::Values(InferenceEngine::Layout::ANY),
-                                ::testing::Values(std::vector<size_t >({1, 3, 50, 50})),
+                                ::testing::Values(inputShapeLarge),
                                ::testing::Values(CommonTestUtils::DEVICE_GPU)),
                        PoolingLayerTest::getTestCaseName);

@@ -65,7 +72,7 @@ const auto maxPool_ExplicitPad_CeilRounding_Params = ::testing::Combine(
        ::testing::Values(false)  // placeholder value - exclude pad not applicable for max pooling
 );

-INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_CeilRpunding, PoolingLayerTest,
+INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_CeilRounding, PoolingLayerTest,
                        ::testing::Combine(
                                maxPool_ExplicitPad_CeilRounding_Params,
                                ::testing::ValuesIn(netPrecisions),
@@ -73,7 +80,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_MaxPool_ExplicitPad_CeilRpunding, PoolingLayerTes
                                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                                ::testing::Values(InferenceEngine::Layout::ANY),
                                ::testing::Values(InferenceEngine::Layout::ANY),
-                                ::testing::Values(std::vector<size_t >({1, 3, 50, 50})),
+                                ::testing::Values(inputShapeLarge),
                                ::testing::Values(CommonTestUtils::DEVICE_GPU)),
                        PoolingLayerTest::getTestCaseName);

@@ -100,7 +107,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_ExplicitPad_CeilRounding, PoolingLayerTes
                               ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                               ::testing::Values(InferenceEngine::Layout::ANY),
                               ::testing::Values(InferenceEngine::Layout::ANY),
-                               ::testing::Values(std::vector<size_t >({1, 3, 30, 30})),
+                               ::testing::Values(inputShapeSmall),
                               ::testing::Values(CommonTestUtils::DEVICE_GPU)),
                       PoolingLayerTest::getTestCaseName);

@@ -125,11 +132,11 @@ INSTANTIATE_TEST_SUITE_P(smoke_AvgPool_ExplicitPad_FloorRounding, PoolingLayerTe
                                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                                ::testing::Values(InferenceEngine::Layout::ANY),
                                ::testing::Values(InferenceEngine::Layout::ANY),
-                                ::testing::Values(std::vector<size_t >({1, 3, 30, 30})),
+                                ::testing::Values(inputShapeSmall),
                                ::testing::Values(CommonTestUtils::DEVICE_GPU)),
                        PoolingLayerTest::getTestCaseName);

-////* ========== Avg and Max Polling Cases ========== */
+////* ========== Avg and Max Pooling Cases ========== */
 /*    ========== Valid Pad Rounding Not Applicable ========== */
 const auto allPools_ValidPad_Params = ::testing::Combine(
        ::testing::Values(PoolingTypes::MAX, PoolingTypes::AVG),
@@ -151,7 +158,61 @@ INSTANTIATE_TEST_SUITE_P(smoke_MAX_and_AVGPool_ValidPad, PoolingLayerTest,
                                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                                ::testing::Values(InferenceEngine::Layout::ANY),
                                ::testing::Values(InferenceEngine::Layout::ANY),
-                                ::testing::Values(std::vector<size_t >({1, 3, 50, 50})),
+                                ::testing::Values(inputShapeLarge),
                                ::testing::Values(CommonTestUtils::DEVICE_GPU)),
                        PoolingLayerTest::getTestCaseName);
+
+
+
+////* ========== MaxPool v8 ========== */
+/* ========== Explicit Pad Floor Rounding ========== */
+const auto maxPool8_ExplicitPad_FloorRounding_Params = ::testing::Combine(  // element order follows maxPoolV8SpecificParams
+        ::testing::ValuesIn(kernels),  // kernel
+        ::testing::ValuesIn(strides),  // stride
+        ::testing::ValuesIn(dilations),  // dilation
+        ::testing::ValuesIn(padBegins),  // pad begin
+        ::testing::ValuesIn(padEnds),  // pad end
+        ::testing::ValuesIn(indexElementTypes),  // index element type
+        ::testing::ValuesIn(axes),  // axis
+        ::testing::Values(ngraph::op::RoundingType::FLOOR),  // rounding type
+        ::testing::Values(ngraph::op::PadType::EXPLICIT)  // pad type
+);
+
+INSTANTIATE_TEST_SUITE_P(smoke_MaxPool8_ExplicitPad_FloorRounding, MaxPoolingV8LayerTest,
+                        ::testing::Combine(
+                                maxPool8_ExplicitPad_FloorRounding_Params,  // MaxPool-8 specific parameters
+                                ::testing::ValuesIn(netPrecisions),  // network precision
+                                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),  // NOTE(review): this and the next three slots look like in/out precision and in/out layout - confirm
+                                ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+                                ::testing::Values(InferenceEngine::Layout::ANY),
+                                ::testing::Values(InferenceEngine::Layout::ANY),
+                                ::testing::Values(inputShapeSmall),  // input shape
+                                ::testing::Values(CommonTestUtils::DEVICE_GPU)),  // target device; on GPU SetUp() also adds output(1) as a Result
+                         MaxPoolingV8LayerTest::getTestCaseName);
+
+/* ========== Explicit Pad Ceil Rounding ========== */
+const auto maxPool8_ExplicitPad_CeilRounding_Params = ::testing::Combine(  // element order follows maxPoolV8SpecificParams
+        ::testing::ValuesIn(kernels),  // kernel
+        ::testing::Values(std::vector<size_t>({1, 1})),  // stride: a single fixed value here, not the 'strides' list
+        ::testing::ValuesIn(dilations),  // dilation
+        ::testing::ValuesIn(padBegins),  // pad begin
+        ::testing::ValuesIn(padEnds),  // pad end
+        ::testing::ValuesIn(indexElementTypes),  // index element type
+        ::testing::ValuesIn(axes),  // axis
+        ::testing::Values(ngraph::op::RoundingType::CEIL),  // rounding type
+        ::testing::Values(ngraph::op::PadType::EXPLICIT)  // pad type
+);
+
+INSTANTIATE_TEST_SUITE_P(smoke_MaxPool8_ExplicitPad_CeilRounding, MaxPoolingV8LayerTest,
+                         ::testing::Combine(
+                                 maxPool8_ExplicitPad_CeilRounding_Params,  // MaxPool-8 specific parameters
+                                 ::testing::ValuesIn(netPrecisions),  // network precision
+                                 ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),  // NOTE(review): this and the next three slots look like in/out precision and in/out layout - confirm
+                                 ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
+                                 ::testing::Values(InferenceEngine::Layout::ANY),
+                                 ::testing::Values(InferenceEngine::Layout::ANY),
+                                 ::testing::Values(inputShapeSmall),  // input shape
+                                 ::testing::Values(CommonTestUtils::DEVICE_GPU)),  // target device; on GPU SetUp() also adds output(1) as a Result
+                         MaxPoolingV8LayerTest::getTestCaseName);
+
 }  // namespace
--- a/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/pooling.hpp
+++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/single_layer/pooling.hpp
@@ -54,6 +54,8 @@ typedef std::tuple<
        std::vector<size_t>,            // Dilation
        std::vector<size_t>,            // Pad begin
        std::vector<size_t>,            // Pad end
+        ngraph::element::Type_t,        // Index element type
+        int64_t,                        // Axis
        ngraph::op::RoundingType,       // Rounding type
        ngraph::op::PadType             // Pad type
 > maxPoolV8SpecificParams;
--- a/src/tests/functional/shared_test_classes/src/single_layer/pooling.cpp
+++ b/src/tests/functional/shared_test_classes/src/single_layer/pooling.cpp
@@ -106,7 +106,9 @@ std::string MaxPoolingV8LayerTest::getTestCaseName(const testing::TestParamInfo<
    std::vector<size_t> padBegin, padEnd;
    ngraph::op::PadType padType;
    ngraph::op::RoundingType roundingType;
-    std::tie(kernel, stride, dilation, padBegin, padEnd, roundingType, padType) = poolParams;
+    ngraph::element::Type indexElementType;
+    int64_t axis;
+    std::tie(kernel, stride, dilation, padBegin, padEnd, indexElementType, axis, roundingType, padType) = poolParams;

    std::ostringstream result;
    result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
@@ -115,6 +117,8 @@ std::string MaxPoolingV8LayerTest::getTestCaseName(const testing::TestParamInfo<
    result << "D" << CommonTestUtils::vec2str(dilation) << "_";
    result << "PB" << CommonTestUtils::vec2str(padBegin) << "_";
    result << "PE" << CommonTestUtils::vec2str(padEnd) << "_";
+    result << "IET" << indexElementType << "_";
+    result << "A" << axis << "_";
    result << "Rounding=" << roundingType << "_";
    result << "AutoPad=" << padType << "_";
    result << "netPRC=" << netPrecision.name() << "_";
@@ -201,7 +205,9 @@ void MaxPoolingV8LayerTest::SetUp() {
    std::vector<size_t> padBegin, padEnd;
    ngraph::op::PadType padType;
    ngraph::op::RoundingType roundingType;
-    std::tie(kernel, stride, dilation, padBegin, padEnd, roundingType, padType) = poolParams;
+    ngraph::element::Type indexElementType;
+    int64_t axis;
+    std::tie(kernel, stride, dilation, padBegin, padEnd, indexElementType, axis, roundingType, padType) = poolParams;

    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
@@ -209,9 +215,17 @@ void MaxPoolingV8LayerTest::SetUp() {
            ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));

    std::shared_ptr<ngraph::Node> maxPool = ngraph::builder::makeMaxPoolingV8(paramOuts[0], stride, dilation, padBegin, padEnd,
-                                                                              kernel, roundingType, padType);
+                                                                              kernel, roundingType, padType,
+                                                                              indexElementType, axis);

-    ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(maxPool->output(0))};
+    const auto maxPoolV8_second_output_is_supported = targetDevice == CommonTestUtils::DEVICE_GPU;
+    ngraph::ResultVector results;
+    if (maxPoolV8_second_output_is_supported) {
+        results = {std::make_shared<ngraph::opset3::Result>(maxPool->output(0)),
+                   std::make_shared<ngraph::opset3::Result>(maxPool->output(1))};
+    } else {
+        results = { std::make_shared<ngraph::opset3::Result>(maxPool->output(0)) };
+    }
    function = std::make_shared<ngraph::Function>(results, params, "MaxPoolV8");
 }

--- a/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp
+++ b/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/builders.hpp
@@ -435,7 +435,9 @@ std::shared_ptr<Node> makeMaxPoolingV8(const ngraph::Output<Node> &in,
                                       const std::vector<size_t> &padsEnd,
                                       const std::vector<size_t> &kernel,
                                       const op::RoundingType &roundingType,
-                                       const op::PadType &padType);
+                                       const op::PadType &padType,
+                                       const ov::element::Type &indexElementType,
+                                       const int64_t axis);

 std::shared_ptr<Node> makeROIPooling(const Output<Node>& input,
                                     const Output<Node>& coords,
--- a/src/tests/ngraph_helpers/ngraph_functions/src/pooling.cpp
+++ b/src/tests/ngraph_helpers/ngraph_functions/src/pooling.cpp
@@ -42,9 +42,12 @@ std::shared_ptr<Node> makeMaxPoolingV8(const ngraph::Output<Node> &in,
                                       const std::vector<size_t> &padsEnd,
                                       const std::vector<size_t> &kernel,
                                       const op::RoundingType &roundingType,
-                                       const op::PadType &padType) {
+                                       const op::PadType &padType,
+                                       const ov::element::Type &indexElementType,
+                                       const int64_t axis) {
    std::shared_ptr<ngraph::Node> pooling = std::make_shared<ngraph::opset8::MaxPool>(in, strides, dilation, padsBegin, padsEnd,
-                                                                                      kernel, roundingType, padType);
+                                                                                      kernel, roundingType, padType,
+                                                                                      indexElementType, axis);
    return pooling;
 }