diff --git a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
index a673d623e14..f0ccf5ba925 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
@@ -2,6 +2,8 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include <numeric>
+
 #include "intel_gpu/plugin/program.hpp"
 #include "intel_gpu/plugin/common_utils.hpp"
 
@@ -166,6 +168,63 @@ static void CreateMatMulOp(Program& p, const std::shared_ptr<ngraph::op::v0::MatMul>& op) {
+        auto transA = op->get_transpose_a();
+        auto transB = op->get_transpose_b();
+
+        std::array<ngraph::PartialShape, 2> inputShapes{
+            op->get_input_partial_shape(0),
+            op->get_input_partial_shape(1)
+        };
+
+        auto canTransposeInputs = [] (const std::array<ngraph::PartialShape, 2>& shapes, bool transA, bool transB) -> bool {
+            if (!transA && !transB)
+                return false;
+            if (shapes[0].rank().is_dynamic() ||
+                shapes[1].rank().is_dynamic())
+                return false;
+
+            // don't transpose inputs if they're aligned to 16
+            bool inputsAligned = std::all_of(shapes[0].rbegin(), shapes[0].rbegin() + 2,
+                                             [] (const ngraph::Dimension& dim) { return dim.is_static() && dim.get_length() % 16 == 0; }) &&
+                                 std::all_of(shapes[1].rbegin(), shapes[1].rbegin() + 2,
+                                             [] (const ngraph::Dimension& dim) { return dim.is_static() && dim.get_length() % 16 == 0; });
+            if (inputsAligned)
+                return false;
+
+            return std::all_of(shapes[0].rbegin(), shapes[0].rbegin() + 2,
+                               [] (const ngraph::Dimension& dim) { return dim.is_static() && dim.get_length() >= 64; }) &&
+                   std::all_of(shapes[1].rbegin(), shapes[1].rbegin() + 2,
+                               [] (const ngraph::Dimension& dim) { return dim.is_static() && dim.get_length() >= 64; });
+        };
+
+        auto transposeInput = [&layerName] (Program& p, const std::shared_ptr<ngraph::Node>& op, const ngraph::PartialShape& shape,
+                                            const std::string& suffix, const cldnn::primitive_id& primitiveId) -> std::string {
+            std::vector<uint16_t> transposeOrder(shape.size());
+            std::iota(transposeOrder.begin(), transposeOrder.end(), 0);
+            for (auto o = transposeOrder.size(); o < 4; o++)
+                transposeOrder.push_back((uint16_t)o);
+            std::swap(*(transposeOrder.end() - 1), *(transposeOrder.end() - 2));
+
+            auto permuteName = op->get_friendly_name() + suffix;
+            auto permutePrim = cldnn::permute(permuteName,
+                                              primitiveId,
+                                              transposeOrder);
+            p.add_primitive(*op, permutePrim);
+            return permuteName;
+        };
+
+        if (canTransposeInputs(inputShapes, transA, transB)) {
+            if (transA) {
+                inputPrimitives[0] = transposeInput(p, op, inputShapes[0], "/transpose_a", inputPrimitives[0]);
+                transA = false;
+            }
+
+            if (transB) {
+                inputPrimitives[1] = transposeInput(p, op, inputShapes[1], "/transpose_b", inputPrimitives[1]);
+                transB = false;
+            }
+        }
+
         auto gemmPrim = cldnn::gemm(layerName,
                                     inputPrimitives,
                                     cldnn::element_type_to_data_type(op->get_output_element_type(0)),
diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/mat_mul.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/mat_mul.cpp
index 8d6b404e3fb..859d65ace9e 100644
--- a/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/mat_mul.cpp
+++ b/src/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/mat_mul.cpp
@@ -33,6 +33,9 @@ const std::vector<ShapeRelatedParams> shapeRelatedParams = {
         { { {2, 1, 2, 3}, true }, { {3, 4, 2}, true } },
         { { {3}, false }, { {2, 2, 3, 1}, false } },
         { { {2, 2, 1, 3}, false }, { {3}, false } },
+        { { {65, 100}, false }, { {73, 100}, true } },
+        { { {100, 65}, true }, { {100, 73}, false } },
+        { { {100, 65}, true }, { {73, 100}, true } },
         { { {1, 5}, false }, { {5, 1}, false } },
         { { {5, 1}, true }, { {5, 1}, false } },
         { { {1, 5}, false }, { {1, 5}, true } },
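
Reviewer note, not part of the patch: the new test shapes (65, 73, 100) appear chosen to exercise the canTransposeInputs heuristic, since every inner dimension is at least 64 but none of them is a multiple of 16. Below is a minimal standalone restatement of that heuristic in plain C++17, with ngraph::PartialShape replaced by a vector of int64_t where -1 stands in for a dynamic dimension; shouldTransposeInputs and lastTwoDims are hypothetical names introduced for this sketch, and the dynamic-rank check from the patch is folded into the size() >= 2 guard.

#include <cstdint>
#include <vector>

using Shape = std::vector<int64_t>;  // -1 == dynamic dimension

// Apply a predicate to the two innermost (matrix) dimensions only.
static bool lastTwoDims(const Shape& s, bool (*pred)(int64_t)) {
    return s.size() >= 2 && pred(s[s.size() - 1]) && pred(s[s.size() - 2]);
}

static bool shouldTransposeInputs(const Shape& a, const Shape& b, bool transA, bool transB) {
    if (!transA && !transB)
        return false;                  // no transpose to fold into a permute
    auto aligned16 = [] (int64_t d) { return d >= 0 && d % 16 == 0; };
    auto atLeast64 = [] (int64_t d) { return d >= 64; };
    // Keep the gemm-internal transpose when both inner 2D blocks are
    // 16-aligned: the tiled kernels already handle that layout well.
    if (lastTwoDims(a, aligned16) && lastTwoDims(b, aligned16))
        return false;
    // Otherwise only insert explicit permutes for reasonably large static
    // matrices, where the extra copy pays for itself.
    return lastTwoDims(a, atLeast64) && lastTwoDims(b, atLeast64);
}

Under this restatement, the first new test case maps to shouldTransposeInputs({65, 100}, {73, 100}, false, true), which returns true, so the plugin would replace MatMul's transpose_b with an explicit permute primitive.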
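The permute order built by transposeInput can likewise be checked in isolation. The sketch below reproduces its identity-then-swap construction under the same assumption the patch encodes, namely that orders shorter than rank 4 are padded up before the two innermost axes are swapped; buildTransposeOrder is a hypothetical name.

#include <cstdint>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// Identity order 0..rank-1, padded to at least rank 4 (mirroring the patch's
// padding before cldnn::permute), then the two innermost axes are swapped so
// batch axes stay in place and only the matrix dims are transposed.
static std::vector<uint16_t> buildTransposeOrder(size_t rank) {
    std::vector<uint16_t> order(rank);
    std::iota(order.begin(), order.end(), static_cast<uint16_t>(0));
    for (size_t o = order.size(); o < 4; o++)
        order.push_back(static_cast<uint16_t>(o));
    std::swap(*(order.end() - 1), *(order.end() - 2));
    return order;
}

int main() {
    for (size_t rank : {2u, 4u, 5u}) {
        std::cout << "rank " << rank << ":";
        for (auto axis : buildTransposeOrder(rank))
            std::cout << ' ' << axis;
        std::cout << '\n';  // rank 2 and 4 -> 0 1 3 2, rank 5 -> 0 1 2 4 3
    }
    return 0;
}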