[IE CLDNN] Fully Connected layer 3d support (#2709)
This commit is contained in: parent f2f5e99f9f, commit b91047e9fe
@@ -43,6 +43,7 @@
#include <legacy/transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_prior_to_ie_prior.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/convert_nms_5_to_legacy.hpp>
#include <legacy/transformations/convert_opset1_to_legacy/reshape_fully_connected.hpp>
#include <legacy/convert_function_to_cnn_network.hpp>
#include <legacy/ie_util_internal.hpp>
#include <legacy/graph_transformer.h>
@@ -241,11 +242,17 @@ InferenceEngine::ICNNNetwork::Ptr clDNNEngine::CloneAndTransformNetwork(const In
transformer.transform(nGraphFunc);
}

const auto reshape_fc_callback = [](const std::shared_ptr<const ::ngraph::Node>& node) -> bool {
    return node->input_value(0).get_shape().size() <= 3lu;
};

{
ngraph::pass::Manager manager = ngraph::pass::Manager();
manager.register_pass<ngraph::pass::ConvertOpSet1ToLegacy>();
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
manager.set_callback(transformations_callback);
auto pass_config = manager.get_pass_config();
pass_config->set_callback<ngraph::pass::ReshapeFullyConnected>(reshape_fc_callback);
manager.run_passes(nGraphFunc);
}
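For context, the reshape_fc_callback registered above is what turns the 3D path on at the graph level: for FullyConnected nodes whose input rank is 3 or less it returns true, which (per the usual pass-callback convention) disables the legacy ReshapeFullyConnected pass for that node, so the unflattened shape reaches the clDNN primitive. A minimal sketch of the effect (the example shapes are hypothetical, not taken from this patch):

// reshape_fc_callback(node) == true  -> ReshapeFullyConnected is skipped for that node
//   FC input {8, 128, 512}     -> rank 3 -> callback true  -> FC keeps its 3D input
//   FC input {8, 3, 224, 224}  -> rank 4 -> callback false -> input is flattened to 2D as before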
@@ -1084,6 +1084,8 @@ void Program::CreateWeightAndBiasPrimitives(cldnn::topology& topology,
case FullyConnected: {
groupSize = 1;
outFeatures = static_cast<cldnn::tensor::value_type>(layer->outData[0]->getTensorDesc().getDims()[1]);
if (in0dims.size() == 3)
outFeatures = static_cast<cldnn::tensor::value_type>(layer->outData[0]->getTensorDesc().getDims()[2]);
switch (in0dims.size()) {
case 4:
weightDimsVec = { TensorValue(layer->outData[0]->getTensorDesc().getDims().back()),
@@ -1093,8 +1095,8 @@ void Program::CreateWeightAndBiasPrimitives(cldnn::topology& topology,
break;
case 3:
weightDimsVec = { TensorValue(layer->outData[0]->getTensorDesc().getDims().back()),
TensorValue(in0dims[1]),
TensorValue(in0dims[2]),
1,
1 };
break;
case 2:
@@ -2927,11 +2929,14 @@ void Program::CreateFullyConnectedPrimitive(cldnn::topology& topology, Inference
IE_ASSERT(weightPrimID.size() == 1);
IE_ASSERT(biasPrimID.size() <= 1);

auto outDims = layer->outData[0]->getTensorDesc().getDims().size();
auto fcPrim = cldnn::fully_connected(fcLayerName,
inputPrimitives[0],
weightPrimID[0],
biasPrimID.empty() ? "" : biasPrimID[0],
DataTypeFromPrecision(fcLayer->outData[0]->getTensorDesc().getPrecision()));
DataTypeFromPrecision(fcLayer->outData[0]->getTensorDesc().getPrecision()),
cldnn::padding(),
layer->outData[0]->getTensorDesc().getDims().size());

topology.add(fcPrim);
@@ -61,10 +61,12 @@ struct fully_connected : public primitive_base<fully_connected> {
const primitive_id& input,
const primitive_id& weights,
const primitive_id& bias = "",
const padding& output_padding = padding())
const padding& output_padding = padding(),
const size_t input_size = 2)
: primitive_base(id, {input}, output_padding),
weights(weights),
bias(bias)
bias(bias),
input_size(input_size)
{}

/// @brief Constructs fully connected layer.
@@ -77,16 +79,20 @@ struct fully_connected : public primitive_base<fully_connected> {
const primitive_id& weights,
const primitive_id& bias,
const data_types data_type,
const padding& output_padding = padding())
const padding& output_padding = padding(),
const size_t input_size = 2)
: primitive_base(id, { input }, output_padding, optional_data_type{data_type}),
weights(weights),
bias(bias)
bias(bias),
input_size(input_size)
{}

/// @brief Primitive id containing weights data.
primitive_id weights;
/// @brief Primitive id containing bias data.
primitive_id bias;
/// @brief Primitive dimension size.
size_t input_size;

protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
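As a usage sketch of the extended interface (the primitive ids, shapes and include paths below are illustrative assumptions, not part of this change), a caller opts into the 3D behaviour by passing the input rank through the new input_size argument:

#include <api/fully_connected.hpp>   // assumed include path for the clDNN C++ API
#include <api/topology.hpp>          // assumed include path

void add_3d_fc(cldnn::topology& topology) {
    // "input" is assumed to be a 3D tensor already present in the topology, e.g. bfyx {2, 3, 1, 128}.
    auto fc_3d = cldnn::fully_connected("fc3d",
                                        "input",
                                        "weights",
                                        "bias",                  // may be "" when there is no bias
                                        cldnn::data_types::f32,
                                        cldnn::padding(),
                                        3);                      // input_size = 3 selects the 3D path
    topology.add(fc_3d);
}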
@@ -52,6 +52,7 @@ ParamsKey FullyConnected_bf_tiled::GetSupportedKey() const {
k.EnableInputLayout(DataLayout::bf);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bf);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableBatching();
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
@@ -68,11 +69,17 @@ bool FullyConnected_bf_tiled::Validate(const Params& params, const optional_para

auto& fc_params = static_cast<const fully_connected_params&>(params);
auto& input = fc_params.inputs[0];
auto& output = fc_params.output;

// Block reads must be aligned to 4 bytes, for fp16 we can correct for offset misalignment,
// but we need to ensure that batch pitch preserves alignment.
if (input.GetDType() == Datatype::F16 && input.Batch().pitch % 2 != 0 && input.Batch().v > 1)
if (input.GetDType() == Datatype::F16) {
if (input.Batch().pitch % 2 != 0 && input.Batch().v > 1)
return false;
// for 3d case we have to check feature alignment as well
if (output.GetLayout() == DataLayout::bfyx && input.Feature().pitch % 2 != 0 && input.Feature().v > 1)
return false;
}

if (input.GetLayout() == DataLayout::bfyx) {
// Padding on input is not supported.
@@ -83,6 +90,12 @@ bool FullyConnected_bf_tiled::Validate(const Params& params, const optional_para
return false;
}

// We don't support 4d output
if (fc_params.output.GetLayout() == DataLayout::bfyx) {
if (input.X().v > 1)
return false;
}

return true;
}

@@ -127,13 +140,20 @@ struct TuneParamsSelector {

bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params, const tune_params& tparams) {
// Check divisibility by dispatch tile sizes.
if (params.output.Batch().v % (tparams.tile_b * tparams.dispatch_bsv) != 0)
size_t output_f = params.output.Feature().v;
size_t output_b = params.output.Batch().v;
if (params.output.GetLayout() == DataLayout::bfyx) {
output_b *= params.output.Feature().v;
output_f = params.output.Y().v;
}

if (output_b % (tparams.tile_b * tparams.dispatch_bsv) != 0)
return false;
if (CeilDiv(params.output.Feature().v, tparams.tile_ofm * simd) % tparams.dispatch_fsv != 0)
if (CeilDiv(output_f, tparams.tile_ofm * simd) % tparams.dispatch_fsv != 0)
return false;

// Same result can be achieved with smaller tile_ofm.
if (params.output.Feature().v <= (tparams.tile_ofm / 2) * simd)
if (output_f <= (tparams.tile_ofm / 2) * simd)
return false;
// No weights layout for such huge tile ofm.
if (tparams.tile_ofm * simd > 64)
@@ -163,6 +183,12 @@ FullyConnected_bf_tiled::GetAutoTuneParams(const fully_connected_params& params,

size_t batch = params.output.Batch().v;
size_t output_f = params.output.Feature().v;

// 3d output
if (params.output.GetLayout() == DataLayout::bfyx) {
batch *= params.output.Feature().v;
output_f = params.output.Y().v;
}
Datatype dtype = params.inputs[0].GetDType();

auto selector = TuneParamsSelector(params);
@@ -219,6 +245,10 @@ FullyConnected_bf_tiled::SetDefault(const fully_connected_params& params, int au

size_t feature_threads = CeilDiv(params.output.Feature().v, tparams.tile_ofm * simd);
size_t batch_threads = params.output.Batch().v / tparams.tile_b;
if (params.output.GetLayout() == DataLayout::bfyx) {
feature_threads = CeilDiv(params.output.Y().v, tparams.tile_ofm * simd);
batch_threads = (params.output.Batch().v * params.output.Feature().v) / tparams.tile_b;
}

dispatchData.gws[0] = feature_threads * batch_threads * simd;
dispatchData.gws[1] = 1;
@@ -252,7 +282,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para

jit.Merge(MakeConstantLoopUnrollJitConstants(dispatchData.tile_m));

bool realign_fp16_offset = params.inputs[0].GetDType() == Datatype::F16 && params.output.GetFirstElementOffset() % 2 != 0;
bool realign_fp16_offset = params.inputs[0].GetDType() == Datatype::F16 && params.inputs[0].GetFirstElementOffset() % 2 != 0;
jit.AddConstant(MakeJitConstant("REALIGN_FP16_OFFSET", realign_fp16_offset));

auto activation_dt = GetActivationType(params);
@@ -260,13 +290,32 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
jit.Merge(MakeActivationJitConstants(params.activations, activation_dt, "_TYPED"));

// for 3d output we are treating spatial as features
if (params.output.GetLayout() == DataLayout::bfyx) {
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.output.Y().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.output.Y().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.output.Feature().pitch));
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
}
else {
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.output.Feature().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.output.Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.output.Batch().pitch));
}

size_t output_f = params.output.GetLayout() == DataLayout::bfyx ? params.output.Y().v : params.output.Feature().v;
if (!params.fused_ops.empty()) {
auto boundary_check = BoundaryCheck::DISABLED;
if (params.output.Feature().v % (dispatchData.tile_n * simd) != 0)
if (output_f % (dispatchData.tile_n * simd) != 0)
boundary_check = BoundaryCheck::ENABLED;

std::vector<std::string> idx_order = {"(out_b + bi)", "out_f", "0", "0"};
if (params.output.GetLayout() == DataLayout::bfyx)
idx_order = {"(out_b + bi) % OUTPUT_BATCH_NUM", "(out_b + bi) / OUTPUT_BATCH_NUM", "out_f", "0"};
FusedOpsConfiguration conf = { "",
{"(out_b + bi)", "out_f", "0", "0"},
idx_order,
"activated[bi]",
activation_dt,
dispatchData.tile_n,
@@ -284,6 +333,9 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
const optional_params &options,
const int autoTuneIndex) const {
auto& fc_params = static_cast<const fully_connected_params&>(params);
size_t output_b = fc_params.output.Batch().v;
if (fc_params.output.GetLayout() == DataLayout::bfyx)
output_b *= fc_params.output.Feature().v;

if (autoTuneIndex >= 0 && autoTuneIndex < (int)auto_tune_params.size()
&& !TuneParamsSelector::VerifyTuneParams(fc_params, auto_tune_params[autoTuneIndex]))
@@ -298,9 +350,9 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
weights_layout = WeightsLayout::os_iyx_osv64;

float estimated_time = DONT_USE_IF_HAVE_SOMETHING_ELSE;
if (fc_params.output.Batch().v > 1 && fc_params.inputs[0].GetDType() == Datatype::F32)
if (output_b > 1 && fc_params.inputs[0].GetDType() == Datatype::F32)
estimated_time = FORCE_PRIORITY_3;
if (fc_params.output.Batch().v > 1 && fc_params.inputs[0].GetDType() == Datatype::F16)
if (output_b > 1 && fc_params.inputs[0].GetDType() == Datatype::F16)
estimated_time = FORCE_PRIORITY_4;

return GetCommonKernelsData(params,
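To summarize the bfyx (3D output) branch threaded through this kernel: batch and feature are merged into the kernel's batch axis, while spatial Y plays the role of the output feature axis. A brief illustration (the numbers are made up, not from the patch):

// 2D case: kernel "batch" = output.Batch().v,                        kernel "features" = output.Feature().v
// 3D case: kernel "batch" = output.Batch().v * output.Feature().v,   kernel "features" = output.Y().v
// e.g. a [2, 3, 64] output is dispatched as 2 * 3 = 6 batch rows and 64 output features,
//      so feature_threads = CeilDiv(64, tile_ofm * simd) and batch_threads = 6 / tile_b.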
@@ -34,8 +34,10 @@ ParamsKey FullyConnected_bfyx_Ref::GetSupportedKey() const {
k.EnableDifferentInputWeightsTypes();
k.EnableDifferentTypes();
k.EnableInputLayout(DataLayout::bf);
k.EnableInputLayout(DataLayout::bfyx);
k.EnableOutputLayout(DataLayout::bf);
k.EnableOutputLayout(DataLayout::fb);
k.EnableOutputLayout(DataLayout::bfyx);
k.EnableBiasPerOutput();
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
@@ -50,7 +52,11 @@ FullyConnected_bfyx_Ref::DispatchData FullyConnected_bfyx_Ref::SetDefault(const
int) const {
auto dispatchData = Parent::SetDefault(params);

dispatchData.gws = { params.output.Feature().v, params.output.Batch().v, 1 };
std::vector<size_t> global = {params.output.Feature().v, params.output.Batch().v, 1};
if (params.output.GetLayout() == DataLayout::bfyx)
global = {params.output.Feature().v * params.output.Y().v, params.output.Batch().v, 1};

dispatchData.gws = global;
dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo);

return dispatchData;
@@ -69,13 +75,14 @@ JitConstants FullyConnected_bfyx_Ref::GetJitConstants(const fully_connected_para
accumulator_dt = Datatype::F32;
activation_dt = Datatype::F32;
}

if (params.output.GetLayout() == DataLayout::bfyx)
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
jit.Merge(MakeTypeJitConstants(activation_dt, "ACTIVATION"));
jit.Merge(MakeTypeJitConstants(accumulator_dt, "ACCUMULATOR"));
jit.Merge(MakeActivationJitConstants(params.activations, activation_dt, "_TYPED"));

if (!params.fused_ops.empty()) {
FusedOpsConfiguration conf = { "", {"b", "ofm", "y", "x"}, "dequantized", activation_dt, 1 };
FusedOpsConfiguration conf = { "", {"b", "ofm", "oym", "0"}, "dequantized", activation_dt, 1 };
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
}
return jit;
@@ -126,6 +133,10 @@ bool FullyConnected_bfyx_Ref::Validate(const Params& params, const optional_para
if (!is_quantization && !has_fused_op)
return false;

// We don't support 4d output
if (fc_params.output.GetLayout() == DataLayout::bfyx && fc_params.output.X().v > 1)
return false;

return true;
}
@@ -66,9 +66,21 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get

const auto& input = params.inputs[0];
const auto& output = params.output;
size_t input_feature = input.Feature().v;
size_t input_batch = input.Batch().v;
size_t output_feature = output.Feature().v;
size_t output_batch = output.Batch().v;
// for 3d case
if (output.GetLayout() == DataLayout::bfyx) {
input_batch *= input.Feature().v;
input_feature = input.Y().v;
output_batch *= output.Feature().v;
output_feature = output.Y().v;
}

tuning_data.sub_group_size = 8;
if (input.X().v == 1 && input.Y().v == 1 && input.Z().v == 1 && input.Batch().v == 1) {
if (input.X().v == 1 && input.Z().v == 1 && input.Batch().v == 1 &&
((input.Y().v == 1 && output.GetLayout() != DataLayout::bfyx) || (input.Feature().v == 1 && output.GetLayout() == DataLayout::bfyx)) ) {
// Known cases for TGL where simd16 works better than simd8
bool simd16_exception_1 = input.Feature().v == 25088 && output.Feature().v == 512;
bool simd16_exception_2 = input.Feature().v == 21504 && output.Feature().v == 512;
@@ -79,14 +91,14 @@ FullyConnectedKernelMMAD::FullyConnectedTuningData FullyConnectedKernelMMAD::Get

size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;

tuning_data.feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size != 0 ?
input.Feature().v / sub_group_pack_size :
tuning_data.feature_blocks_count = input.GetLayout() == DataLayout::bfyx && input_feature % sub_group_pack_size != 0 ?
input_feature / sub_group_pack_size :
input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 ?
CeilDiv(input.Feature().v, 32) % 2 == 0 ? CeilDiv(input.Feature().v, 64) : CeilDiv(input.Feature().v, 64) - 1 :
CeilDiv(input.Feature().v, sub_group_pack_size);
CeilDiv(input_feature, 32) % 2 == 0 ? CeilDiv(input_feature, 64) : CeilDiv(input_feature, 64) - 1 :
CeilDiv(input_feature, sub_group_pack_size);

bool slm_div_factor_exception = input.Batch().v == 300 && input.Feature().v == 2048 &&
output.Batch().v == 300 && (output.Feature().v == 324 || output.Feature().v == 81);
bool slm_div_factor_exception = input_batch == 300 && input_feature == 2048 &&
output_batch == 300 && (output_feature == 324 || output_feature == 81);

if (tuning_data.feature_blocks_count && tuning_data.sub_group_size == 8 && !slm_div_factor_exception)
while (tuning_data.feature_blocks_count % (tuning_data.slm_div_factor * 2) == 0 &&
@@ -120,7 +132,11 @@ FullyConnectedKernelMMAD::DispatchData FullyConnectedKernelMMAD::SetDefault(cons
auto dispatchData = Parent::SetDefault(params);
const auto& output = params.output;

dispatchData.gws = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
std::vector<size_t> global = { Align(output.Feature().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, 1 };
if (output.GetLayout() == DataLayout::bfyx)
global = { Align(output.Y().v, tuning_data.sub_group_size) * tuning_data.slm_div_factor, output.Batch().v, output.Feature().v };

dispatchData.gws = global;
dispatchData.lws = { tuning_data.work_group_size, 1, 1 };

return dispatchData;
@@ -133,6 +149,7 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
auto jit = Parent::GetJitConstants(params, runInfo);

auto& input = params.inputs[0];
auto& output = params.output;
auto& weights = params.weights;

size_t sub_group_pack_size = tuning_data.sub_group_size * tuning_data.pack_size;
@@ -181,6 +198,9 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
bool has_feature_leftovers = (input.GetLayout() == DataLayout::bfyx && input.Feature().v % sub_group_pack_size) ||
(input.GetLayout() != DataLayout::bfyx && tuning_data.sub_group_size == 16 && CeilDiv(input.Feature().v, 32) % 2);

if (output.GetLayout() == DataLayout::bfyx)
has_feature_leftovers = input.Y().v % sub_group_pack_size;

jit.AddConstant(MakeJitConstant("HAS_FEATURE_LEFTOVERS", has_feature_leftovers));
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKS_COUNT", tuning_data.feature_blocks_count));
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
@@ -200,9 +220,24 @@ JitConstants FullyConnectedKernelMMAD::GetJitConstants(const fully_connected_par
jit.AddConstant(MakeJitConstant("SPLIT_SPATIAL", split_spatial));
jit.AddConstant(MakeJitConstant("SPATIAL_MAJOR", spatial_major));

if (output.GetLayout() == DataLayout::bfyx) {
jit.AddConstant(MakeJitConstant("FEATURE_PITCH", input.Y().pitch));
jit.AddConstant(MakeJitConstant("OUT_FEATURE_NUM", output.Y().v));
jit.AddConstant(MakeJitConstant("IN_FEATURE_NUM", input.Y().v));
jit.AddConstant(MakeJitConstant("IS_3D", true));
} else {
jit.AddConstant(MakeJitConstant("FEATURE_PITCH", input.Feature().pitch));
jit.AddConstant(MakeJitConstant("OUT_FEATURE_NUM", output.Feature().v));
jit.AddConstant(MakeJitConstant("IN_FEATURE_NUM", input.Feature().v));
}

if (!params.fused_ops.empty()) {
auto input_dt = GetActivationType(params);
FusedOpsConfiguration conf = { "", {"batch", "feature", "0", "0"}, "dequantized", input_dt, 1 };
std::vector<std::string> idx_order = {"batch", "feature", "0", "0"};
if (output.GetLayout() == DataLayout::bfyx)
idx_order = {"batch", "skip_f", "feature", "0"};

FusedOpsConfiguration conf = { "", idx_order, "dequantized", input_dt, 1 };
jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
}
@@ -49,6 +49,7 @@ KERNEL(fully_connected_gpu_MMAD)(
const uint feature = (uint)get_group_id(0) * feature_per_wg + (uint)get_global_id(0) % feature_per_wg;
const uint feature_block = lid0 / feature_per_wg;
const uint batch = (uint)get_global_id(1);
const uint skip_f = (uint)get_global_id(2);

int dotProd = 0;

@@ -56,7 +57,7 @@ KERNEL(fully_connected_gpu_MMAD)(
#if INPUT0_DIMS == 5
const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0, 0);
#else
const uint input_offset = INPUT0_GET_INDEX(batch, 0, 0, 0);
const uint input_offset = INPUT0_GET_INDEX(batch, skip_f, 0, 0);
#endif

#if SLM_DIV_FACTOR > 1
@@ -221,17 +222,17 @@ KERNEL(fully_connected_gpu_MMAD)(
#endif // SPATIAL_MAJOR

#if !SPLIT_SPATIAL
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH;
uint input_idx = input_offset + spatial * MMAD_INPUT_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * FEATURE_PITCH;
#else
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * INPUT0_FEATURE_PITCH +
uint input_idx = input_offset + FEATURE_BLOCKS_COUNT * FEATURE_PITCH +
zi * MMAD_INPUT_Z_PITCH + yi * MMAD_INPUT_Y_PITCH + xi * MMAD_INPUT_X_PITCH;
#endif // !SPLIT_SPATIAL
uint filter_idx = filter_offset + spatial * MMAD_FILTER_SPATIAL_PITCH + FEATURE_BLOCKS_COUNT * MMAD_FILTER_FBLOCK_PITCH;

MAKE_VECTOR_TYPE(INPUT0_TYPE, 4) input_data_u = (0, 0, 0, 0);
for (uint i = 0; i < 4; i++) {
if (FEATURE_BLOCKS_COUNT * SUB_GROUP_SIZE * 4 + sglid * 4 + i < INPUT0_FEATURE_NUM) {
input_data_u[i] = input[input_idx + (sglid * 4 + i) * INPUT0_FEATURE_PITCH];
if (FEATURE_BLOCKS_COUNT * SUB_GROUP_SIZE * 4 + sglid * 4 + i < IN_FEATURE_NUM) {
input_data_u[i] = input[input_idx + (sglid * 4 + i) * FEATURE_PITCH];
}
}
INPUT_PACKED_TYPE input_data = AS_TYPE(INPUT_PACKED_TYPE, input_data_u);
@@ -269,7 +270,7 @@ KERNEL(fully_connected_gpu_MMAD)(
}
#endif // HAS_FEATURE_LEFTOVERS

if (OUTPUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && feature >= OUTPUT_FEATURE_NUM)
if (OUT_FEATURE_NUM % SUB_GROUP_SIZE != 0 && feature >= OUT_FEATURE_NUM)
return;

#if BIAS_TERM
@@ -284,7 +285,11 @@ KERNEL(fully_connected_gpu_MMAD)(
float dequantized = (float)dotProd;
#endif // BIAS_TERM

#if IS_3D
const uint out_idx = OUTPUT_GET_INDEX(batch, skip_f, feature, 0);
#else
const uint out_idx = OUTPUT_GET_INDEX(batch, feature, 0, 0);
#endif

#if HAS_FUSED_OPS
FUSED_OPS;
@@ -67,13 +67,27 @@

#define MAX(a, b) ((a) > (b) ? (a) : (b))

// Check alignment restrictions for using block writes on output.
#define USE_BLOCK_WRITE ((OUTPUT_TYPE_SIZE * OUTPUT_BATCH_PITCH) % 16 == 0 && (OUTPUT_TYPE_SIZE * OUTPUT_OFFSET) % 16 == 0)
#define USE_BLOCK_WRITE ((OUTPUT_TYPE_SIZE * TILE_OUT_B_PITCH) % 16 == 0 && (OUTPUT_TYPE_SIZE * OUTPUT_OFFSET) % 16 == 0)

#if !REALIGN_FP16_OFFSET
# if OUTPUT_3D
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y
# else
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
# endif
#else
// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
# if OUTPUT_3D
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1)
# else
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1)
# endif
#endif

#if OUTPUT_3D
# define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y
#else
# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
#endif

__attribute__((intel_reqd_sub_group_size(SIMD)))
@@ -97,25 +111,24 @@ KERNEL(fc)(
// full dispatch pipeline.
uint feature_mini_block = gid % DISPATCH_FSV;
uint batch_mini_block = gid / DISPATCH_FSV % DISPATCH_BSV;
uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(OUTPUT_FEATURE_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);
uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(OUTPUT_FEATURE_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);
uint feature_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV) % (CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);
uint batch_mega_block = gid / (DISPATCH_FSV * DISPATCH_BSV * CEIL_DIV(TILE_OUT_F_NUM, TILE_OFM * SIMD) / DISPATCH_FSV);

uint out_f = (feature_mega_block * DISPATCH_FSV + feature_mini_block) * (TILE_OFM * SIMD);
uint out_b = (batch_mega_block * DISPATCH_BSV + batch_mini_block) * TILE_B;
uint out_b = ((batch_mega_block * DISPATCH_BSV + batch_mini_block) * TILE_B);

ACCUMULATOR_VEC_TYPE acc[TILE_B] = { };
INPUT_VEC_TYPE in_0[TILE_B] = { };

FILTER_VEC_TYPE wei = 0;

uint weights_offset = out_f * INPUT0_ELEMENTS_COUNT;
uint input_offset = out_b * INPUT0_BATCH_PITCH + INPUT0_OFFSET;
uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
uint weights_offset = out_f * INPUT_ELEMENTS_COUNT;

#if REALIGN_FP16_OFFSET
// For fp16 we need to ensure that all block reads are aligned to 4 byte (2 words) boundary.
// To do this solve first input feature separately.
{
INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % TILE_B * INPUT0_BATCH_PITCH];
INPUT0_TYPE tmp_input = input[input_offset + get_sub_group_local_id() % TILE_B * TILE_IN_B_PITCH];
MAKE_VECTOR_TYPE(FILTER_TYPE, TILE_OFM) tmp_wei = BLOCK_READN(FILTER_TYPE, TILE_OFM, weights, weights_offset);

__attribute__((opencl_unroll_hint))
@@ -135,13 +148,12 @@ KERNEL(fc)(
// Load input.
#define LOAD_IN_0(bi) do { \
in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \
input_offset += INPUT0_BATCH_PITCH; \
input_offset += TILE_IN_B_PITCH; \
} while (false)

CONST_LOOP(TILE_B, LOAD_IN_0);
#undef LOAD_IN_0
input_offset += TILE_IFM * SIMD - INPUT0_BATCH_PITCH * TILE_B;

input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B;
// NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
// but significantly degrades readability and generality of code.
// It also doesn't show noticeable performance improvement on tested configurations.
@@ -172,13 +184,12 @@ KERNEL(fc)(
{
#define LOAD_IN_0(bi) do { \
in_0[bi] = INPUT_BLOCK_READ(input, input_offset); \
input_offset += INPUT0_BATCH_PITCH; \
input_offset += TILE_IN_B_PITCH; \
} while (false)

CONST_LOOP(TILE_B, LOAD_IN_0);
#undef LOAD_IN_0
input_offset += TILE_IFM * SIMD - INPUT0_BATCH_PITCH * TILE_B;

input_offset += TILE_IFM * SIMD - TILE_IN_B_PITCH * TILE_B;
__attribute__((opencl_unroll_hint))
for (uint ki = 0; ki < CEIL_DIV(LEFTOVER_IFM, TILE_K); ++ki) {
wei = FILTER_BLOCK_READ(weights, weights_offset);
@@ -210,7 +221,7 @@ KERNEL(fc)(
}

#if BIAS_TERM
#if OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0
#if TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0
BIAS_VEC_TYPE bias = BIAS_BLOCK_READ(biases, out_f);
#else
BIAS_VEC_TYPE bias = 0;
@@ -242,12 +253,12 @@ KERNEL(fc)(
#endif
// =====================================================================================================================================
// Write results
uint output_offset = out_f * OUTPUT_FEATURE_PITCH + out_b * OUTPUT_BATCH_PITCH + OUTPUT_OFFSET;
uint output_offset = out_f * TILE_OUT_F_PITCH + out_b * TILE_OUT_B_PITCH + OUTPUT_OFFSET;

if (USE_BLOCK_WRITE && (OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0 || out_f + (TILE_OFM * SIMD) <= OUTPUT_FEATURE_NUM)) {
if (USE_BLOCK_WRITE && (TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || out_f + (TILE_OFM * SIMD) <= TILE_OUT_F_NUM)) {
#define WRITE_OUTPUT(bi) do { \
OUTPUT_BLOCK_WRITE(output, output_offset, result[bi]); \
output_offset += OUTPUT_BATCH_PITCH; \
output_offset += TILE_OUT_B_PITCH; \
} while (false)

CONST_LOOP(TILE_B, WRITE_OUTPUT);
@@ -258,8 +269,8 @@ KERNEL(fc)(
// TODO: Investigate why below code doesn't compile and check how it affects performance.
//#define WRITE_OUTPUT_FEATURE(fi) do { \
// const bool should_write = \
// OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0 || \
// out_f + (fi) * SIMD + get_sub_group_local_id() < OUTPUT_FEATURE_NUM; \
// TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 || \
// out_f + (fi) * SIMD + get_sub_group_local_id() < TILE_OUT_F_NUM; \
// if (should_write) { \
// output[output_offset] = result[out_bi][fi]; \
// } \
@@ -269,7 +280,7 @@ KERNEL(fc)(
//#define WRITE_OUTPUT(bi) do { \
// const uint out_bi = bi; \
// CONST_LOOP(TILE_OFM, WRITE_OUTPUT_FEATURE); \
// output_offset += OUTPUT_BATCH_PITCH - TILE_OFM * SIMD; \
// output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD; \
// } while (false)
//
//CONST_LOOP(TILE_B, WRITE_OUTPUT);
@@ -279,14 +290,14 @@ KERNEL(fc)(
for (uint bi = 0; bi < TILE_B; ++bi) {
for (uint fi = 0; fi < TILE_OFM; ++fi) {
const bool should_write =
OUTPUT_FEATURE_NUM % (TILE_OFM * SIMD) == 0 ||
out_f + fi * SIMD + get_sub_group_local_id() < OUTPUT_FEATURE_NUM;
TILE_OUT_F_NUM % (TILE_OFM * SIMD) == 0 ||
out_f + fi * SIMD + get_sub_group_local_id() < TILE_OUT_F_NUM;
if (should_write) {
output[output_offset] = ((OUTPUT_TYPE*)(&result[bi]))[fi];
}
output_offset += SIMD;
}
output_offset += OUTPUT_BATCH_PITCH - TILE_OFM * SIMD;
output_offset += TILE_OUT_B_PITCH - TILE_OFM * SIMD;
}
}
// =====================================================================================================================================
@@ -27,10 +27,31 @@ KERNEL(fc)(
#endif
)
{
#if OUTPUT_3D
const uint oxfm = get_global_id(0);
const uint b = get_global_id(1);
const uint oym = oxfm % OUTPUT_SIZE_Y;
const uint ofm = oxfm / OUTPUT_SIZE_Y;

ACCUMULATOR_TYPE dotProd = ACCUMULATOR_VAL_ZERO;

for (uint y = 0; y < INPUT0_SIZE_Y; ++y)
{
for(uint x = 0; x < INPUT0_SIZE_X; ++x )
{
const uint input0_idx = GET_DATA_INDEX(INPUT0, b, ofm, y, x);
const uint filter_idx = GET_FILTER_INDEX(FILTER, 0, oym, y, 0, 0);
dotProd += (ACCUMULATOR_TYPE)(input[input0_idx] * weights[filter_idx]);
}
}

const uint dst_index = GET_DATA_INDEX(OUTPUT, b, ofm, oym, 0);
const uint bias_index = oym;
#else
const uint ofm = get_global_id(0);
const uint b = get_global_id(1);

ACCUMULATOR_TYPE dotProd = (ACCUMULATOR_TYPE)0;
ACCUMULATOR_TYPE dotProd = ACCUMULATOR_VAL_ZERO;

for (uint ifm = 0; ifm < INPUT0_FEATURE_NUM; ++ifm)
{
@@ -46,9 +67,10 @@ KERNEL(fc)(
}

const uint dst_index = GET_DATA_INDEX(OUTPUT, b, ofm, 0, 0);
const uint bias_index = ofm;
#endif

#if BIAS_TERM
const uint bias_index = ofm;
ACTIVATION_TYPE dequantized = dotProd + biases[bias_index];
#else
ACTIVATION_TYPE dequantized = dotProd;
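A small worked example of the OUTPUT_3D index decomposition above (the values are illustrative, not from the patch):

// Assuming OUTPUT_SIZE_Y == 64 (output neurons) and get_global_id(0) == 130:
//   oym = 130 % 64 = 2   -> output neuron within the row
//   ofm = 130 / 64 = 2   -> which merged (batch, feature) row is being computed
// This matches the bfyx dispatch in FullyConnected_bfyx_Ref::SetDefault, where gws[0] is
// output.Feature().v * output.Y().v, so every (row, neuron) pair gets one work item.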
@@ -54,6 +54,10 @@ bool is_batch_after_spatial(const std::string order) {
format::type get_preferred_format(const fully_connected_node& node) {
auto input_layout = node.input().get_output_layout();

// for 3d output we have to choose bfyx format
if (node.get_primitive()->input_size == 3)
return format::bfyx;

if (data_type_traits::is_floating_point(input_layout.data_type) &&
(is_batch_after_spatial(input_layout.format.order()) ||
input_layout.format == format::bs_x_bsv16 ||
@@ -107,6 +111,9 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
}

auto output_size = tensor(input_layout.size.batch[0], weights_layout.size.batch[0], 1, 1);
if (desc->input_size == 3) {
output_size = tensor(input_layout.size.batch[0], input_layout.size.feature[0], 1, weights_layout.size.batch[0]);
}
format output_format = get_preferred_format(node);

return layout(output_type, output_format, output_size);
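A worked example of the 3D branch of calc_output_layout (the shapes are illustrative, not from the patch):

// input layout  : bfyx { batch = 2, feature = 3, x = 1, y = 128 }   // a [2, 3, 128] tensor
// weights layout: { batch = 64, ... }                               // 64 output neurons
// desc->input_size == 3
// output_size = tensor(2 /*b*/, 3 /*f*/, 1 /*x*/, 64 /*y*/)         // i.e. a [2, 3, 64] result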
@@ -57,10 +57,11 @@ public:
arg.get_program());
fc_optional_params.allowInputReordering = true;

fc_params.output = fc_params.output.FlattenFeatureAndSpatials();

const auto primitive = arg.get_primitive();

if (primitive->input_size != 3)
fc_params.output = fc_params.output.FlattenFeatureAndSpatials();

if (arg.get_output_layout().data_type == data_types::i8 ||
arg.get_output_layout().data_type == data_types::u8) {
fc_params.quantization = kernel_selector::QuantizationType::SYMMETRIC;
@@ -1150,6 +1150,109 @@ INSTANTIATE_TEST_CASE_P(smoke_bfyx_batched,
::testing::Values(format::bfyx),
::testing::Values("")), );


template <typename InputT, typename WeightsT, typename BiasT, typename OutputT>
struct fully_connected_random_test_3d : ::testing::TestWithParam<fully_connected_test_params> {
void run_test() {
size_t batch, input_f, input_x, input_y, output_y;
format::type input_format, output_format;
std::string kernel;

std::tie(batch, input_f, input_x, input_y, output_y, input_format, output_format, kernel) = GetParam();

auto input_data = generate_smart_random_4d<InputT>(batch, input_f, input_y, input_x);
auto weights_data = generate_smart_random_4d<WeightsT>(output_y, input_y, 1, 1);
auto bias_data = generate_smart_random_2d<BiasT>(1, output_y);

auto eng = get_test_engine();
auto net = network_test(eng);
auto input = net.add_input_layout<InputT, 4>("input", input_format, std::move(input_data));
auto weights = net.add_data<WeightsT, 4>("weights", format::oiyx, std::move(weights_data));
auto bias = net.add_data<BiasT, 2>("bias", format::bfyx, std::move(bias_data));
auto fc = net.add_fully_connected_3d<OutputT>("fc", input, weights, bias, implementation_desc{ output_format, kernel }, 3);

net.run(build_options(build_option::optimize_data(true)));
}
};


using fully_connected_random_test_f32_3d = fully_connected_random_test_3d<float, float, float, float>;
using fully_connected_random_test_f16_3d = fully_connected_random_test_3d<FLOAT16, FLOAT16, FLOAT16, FLOAT16>;
using fully_connected_random_test_i8_3d = fully_connected_random_test_3d<int8_t, int8_t, int8_t, float>;

TEST_P(fully_connected_random_test_f32_3d, basic) {
run_test();
}

INSTANTIATE_TEST_CASE_P(smoke,
fully_connected_random_test_f32_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(1,3),
::testing::Values(1),
::testing::Values(1,3,16),
::testing::Values(1,3,16),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

INSTANTIATE_TEST_CASE_P(smoke_big,
fully_connected_random_test_f32_3d,
::testing::Combine(
::testing::Values(3),
::testing::Values(16, 17, 32),
::testing::Values(1),
::testing::Values(17, 32),
::testing::Values(17, 32),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

TEST_P(fully_connected_random_test_f16_3d, basic) {
run_test();
}

INSTANTIATE_TEST_CASE_P(smoke,
fully_connected_random_test_f16_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(1,3),
::testing::Values(1),
::testing::Values(1,3,16),
::testing::Values(1,3,16),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

TEST_P(fully_connected_random_test_i8_3d, basic) {
run_test();
}

INSTANTIATE_TEST_CASE_P(smoke,
fully_connected_random_test_i8_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(1,3),
::testing::Values(1),
::testing::Values(1,3,16),
::testing::Values(1,3,16),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );

INSTANTIATE_TEST_CASE_P(smoke_big,
fully_connected_random_test_i8_3d,
::testing::Combine(
::testing::Values(1,3),
::testing::Values(16,17),
::testing::Values(1),
::testing::Values(17, 32),
::testing::Values(17, 32),
::testing::Values(format::bfyx),
::testing::Values(format::any),
::testing::Values("")), );


struct quantization_t {
VF<float> input_low;
VF<float> input_high;
@@ -368,6 +368,38 @@ public:
layout get_per_channel_layout(T& p) {
return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
}

size_t get_fc_output_dim_size(bc_test_params& p) {
size_t size = 2;
for (auto i : p.out_shape.spatial) {
if (i > 1)
size++;
}
return size;
}

layout get_fc_weights_layout(T& p) {
cldnn::tensor weights_tensor;
if (p.out_shape.spatial[1] > 1) {
// 3d case
weights_tensor = cldnn::tensor(p.kernel.batch[0], p.kernel.feature[0], 1, 1);
}
else {
weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.feature[0]),
spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2]));
}
return layout{p.weights_type, p.weights_format, weights_tensor};
}

layout get_fc_bias_layout(T& p) {
if (p.out_shape.spatial[1] > 1) {
// 3d case
return layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.spatial[1], 1, 1} };
}
else {
return layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.feature[0], 1, 1} };
}
}
};

class ResamplePrimitiveFusingTest : public ::BaseFusingTest<resample_test_params> {
@@ -557,10 +589,16 @@ public:
#define CASE_FC_FP32_1 {1, 1, 3, 1}, {1, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP32_2 {2, 1, 3, 1}, {2, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::yxfb, data_types::f32, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP32_3 {2, 32, 1, 1}, {2, 16, 1, 1}, {16, 32, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_FP32_3D_1 {5, 3, 1, 3}, {5, 3, 1, 5}, {5, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
#define CASE_FC_FP32_3D_2 {2, 1, 1, 1}, {2, 1, 1, 32}, {32, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
#define CASE_FC_FP32_3D_3 {2, 32, 1, 32}, {2, 32, 1, 16}, {16, 32, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx

#define CASE_FC_U8S8_1 {1, 1, 3, 1}, {1, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_2 {2, 1, 3, 1}, {2, 4, 1, 1}, {4, 1, 3, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3 {2, 32, 1, 1}, {2, 16, 1, 1}, {16, 32, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_1 {2, 32, 1, 3}, {2, 32, 1, 16}, {16, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_2 {1, 1, 1, 3}, {1, 1, 1, 32}, {32, 3, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
#define CASE_FC_U8S8_3D_3 {2, 3, 1, 1}, {2, 3, 1, 15}, {15, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx

#define CASE_NORMALIZE_I8_1 {1, 2, 3, 3}, data_types::u8, format::bfyx, data_types::f32, format::bfyx

@@ -2475,9 +2513,9 @@ class fc_fp32_activation : public FCFusingTest {};
TEST_P(fc_fp32_activation, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
fully_connected("fc_prim", "input", "weights", "bias"),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
fully_connected("fc_prim", "input", "weights", "bias", padding(), get_fc_output_dim_size(p)),
activation("activation", "fc_prim", activation_func::abs),
reorder("reorder_bfyx", "activation", p.default_format, data_types::f32)
);
@@ -2490,14 +2528,17 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_fp32_activation, ::testing::ValuesIn(std
bc_test_params{ CASE_FC_FP32_1, 2, 3 },
bc_test_params{ CASE_FC_FP32_2, 2, 3 },
bc_test_params{ CASE_FC_FP32_3, 2, 3 },
bc_test_params{ CASE_FC_FP32_3D_1, 2, 3 },
bc_test_params{ CASE_FC_FP32_3D_2, 2, 3 },
bc_test_params{ CASE_FC_FP32_3D_3, 2, 3 },
}), );

class fc_fp32_bias : public FCFusingTest {};
TEST_P(fc_fp32_bias, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
fully_connected("fc_prim", "input", "weights", ""),
eltwise("bias_add", {"fc_prim", "bias"}, eltwise_mode::sum),
reorder("reorder_bfyx", "bias_add", p.default_format, data_types::f32)
@@ -2517,10 +2558,10 @@ class fc_int8_scale : public FCFusingTest {};
TEST_P(fc_int8_scale, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data"),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
@@ -2532,10 +2573,10 @@ TEST_P(fc_int8_scale, basic) {
TEST_P(fc_int8_scale, fp16_scale_out) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data", optional_data_type{data_types::f16}),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);
@@ -2549,19 +2590,22 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale,
bc_test_params{ CASE_FC_U8S8_1, 2, 3 },
bc_test_params{ CASE_FC_U8S8_2, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_1, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 3 },
}), );

class fc_int8_quantize_u8 : public FCFusingTest {};
TEST_P(fc_int8_quantize_u8, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), 0)),
data("out_hi", get_mem(get_single_element_layout(p), 255)),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
quantize("quantize", "fc_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
);
@@ -2575,20 +2619,23 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu_fc, fc_int8_quantize_u8,
bc_test_params{CASE_FC_U8S8_1, 2, 3},
bc_test_params{CASE_FC_U8S8_2, 2, 3},
bc_test_params{CASE_FC_U8S8_3, 2, 3},
bc_test_params{ CASE_FC_U8S8_3D_1, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 3 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 3 },
}), );

class fc_int8_scale_quantize_i8 : public FCFusingTest {};
TEST_P(fc_int8_scale_quantize_i8, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data"),
quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
@@ -2602,6 +2649,9 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale_quantize_i8,
bc_test_params{CASE_FC_U8S8_1, 2, 4},
bc_test_params{CASE_FC_U8S8_2, 2, 4},
bc_test_params{CASE_FC_U8S8_3, 2, 4},
bc_test_params{ CASE_FC_U8S8_3D_1, 2, 4 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 4 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 4 },
}), );


@@ -2610,14 +2660,14 @@ class fc_int8_scale_activation_quantize_i8 : public FCFusingTest {};
TEST_P(fc_int8_scale_activation_quantize_i8, basic) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("weights", get_mem(get_fc_weights_layout(p))),
data("bias", get_mem(get_fc_bias_layout(p))),
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
data("out_lo", get_mem(get_single_element_layout(p), -127)),
data("out_hi", get_mem(get_single_element_layout(p), 127)),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32),
fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_fc_output_dim_size(p)),
scale("scale", "fc_prim", "scale_data"),
activation("activation_scale", "scale", activation_func::exp),
quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
@@ -2633,6 +2683,14 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale_activation_quantize_i8,
bc_test_params{CASE_FC_U8S8_1, 2, 5},
bc_test_params{CASE_FC_U8S8_2, 2, 5},
bc_test_params{CASE_FC_U8S8_3, 2, 5},

bc_test_params{ CASE_FC_U8S8_3D_1, 2, 5 },
bc_test_params{ CASE_FC_U8S8_3D_2, 2, 5 },
bc_test_params{ CASE_FC_U8S8_3D_3, 2, 5 },

bc_test_params{ CASE_FC_FP32_3D_1, 3, 5 },
bc_test_params{ CASE_FC_FP32_3D_2, 3, 5 },
bc_test_params{ CASE_FC_FP32_3D_3, 3, 5 },
}), );
@@ -144,7 +144,6 @@ template <typename T>
struct reference_tensor_typed<T, 4> : reference_tensor {
using vector_type = VVVVF<T>;
reference_tensor_typed(vector_type data) : reference(std::move(data)) {}

void compare(cldnn::memory actual) override {
auto ptr = actual.pointer<T>();
for (size_t bi = 0; bi < reference.size(); ++bi) {
@@ -231,6 +230,36 @@ VVF<OutputT> fully_connected_reference_typed(VVVVF<InputT>& input, VVVVF<Weights
return output;
}

template <typename OutputT,
typename InputT,
typename WeightsT,
typename BiasT,
typename AccT = typename fully_connected_accumulator<InputT>::type>
VVVVF<OutputT> fully_connected_reference_typed_3d(VVVVF<InputT>& input, VVVVF<WeightsT>& weights, VF<BiasT>& bias) {
size_t input_f = input[0].size();
size_t input_y = input[0][0].size();
size_t input_x = input[0][0][0].size();
size_t output_b = input.size();       // input is assumed to be bfyx
size_t output_f = weights.size();     // weights is assumed to be bfyx
size_t weights_f = weights[0].size(); // weights is assumed to be bfyx
VVVVF<OutputT> output(output_b, VVVF<OutputT>(input_f, VVF<OutputT>(output_f, VF<OutputT>(1))));
OutputT res;
for (size_t b = 0; b < output_b; ++b) {
for (size_t n = 0; n < input_f; ++n) {
for (size_t f = 0; f < output_f; ++f) {
res = bias[f];
for (size_t y = 0; y < input_y; ++y) {
for (size_t x = 0; x < input_x; ++x) {
res += (OutputT)input[b][n][y][x] * (OutputT)weights[f][y][0][0];
}
}
output[b][n][f][0] = (OutputT)res;
}
}
}
return output;
}
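Spelled out, the reference helper above computes an ordinary batched matrix product plus bias, one row per input feature slice (notation only, not code from the patch):

// output[b][n][f] = bias[f] + sum over y, x of input[b][n][y][x] * weights[f][y][0][0]
// i.e. for each batch b and input-feature row n, a dot product of that row with the f-th
// weight vector over the Y (and X) axes, matching the [output_y, input_y, 1, 1] weights
// generated by the 3D tests.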
// =====================================================================================================================
// Network test
struct reference_node_interface {
@@ -300,6 +329,22 @@ public:
return add_node(id, reference_tensor_typed<T, 2>(output_data), { input, weights, bias });
}

template <typename T, typename InputT, size_t InputN, typename WeightsT, typename BiasT>
typename reference_node<T, 4>::ptr add_fully_connected_3d(cldnn::primitive_id id,
std::shared_ptr<reference_node<InputT, InputN>> input,
std::shared_ptr<reference_node<WeightsT, InputN>> weights,
std::shared_ptr<reference_node<BiasT, 2>> bias,
cldnn::implementation_desc force = cldnn::implementation_desc{cldnn::format::any, ""},
size_t input_dim_size = 3) {
topo.add(cldnn::fully_connected(id, input->id, weights->id, bias->id, cldnn::type_to_data_type<T>::value, cldnn::padding(), input_dim_size));
if (force.output_format != cldnn::format::any || force.kernel_name != "")
forced_impls[id] = force;
VVVVF<T> output_data = fully_connected_reference_typed_3d<T>(input->reference.reference,
weights->reference.reference,
bias->reference.reference[0]);
return add_node(id, reference_tensor_typed<T, 4>(output_data), {input, weights, bias});
}

cldnn::network build_network(cldnn::build_options opts) {
opts.set_option(cldnn::build_option::force_implementations(forced_impls));
auto net = cldnn::network(eng, topo, opts);