diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp
index 3284499497d..1164227643c 100644
--- a/src/plugins/intel_cpu/src/nodes/reduce.cpp
+++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -1823,8 +1823,7 @@ void Reduce::initSupportedPrimitiveDescriptors() {
 
     precision_change = input_prec != output_prec;
     support_split = algorithm != Algorithm::ReduceL2 && algorithm != Algorithm::ReduceLogSumExp &&
-                    algorithm != Algorithm::ReduceSumSquare &&
-                    (!precision_change || (input_prec == Precision::BF16 && output_prec == Precision::FP32));
+                    algorithm != Algorithm::ReduceSumSquare;
 
     src_data_size = input_prec.size();
     dst_data_size = output_prec.size();
@@ -2527,18 +2526,35 @@ inline void Reduce::reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, s
 inline void Reduce::reduce_kernel_post_process(uint8_t *out_ptr) {
     const size_t integerDivisor = IB * IC * ID * IH * IW / (OB * OC * OD * OH * OW);
     const float divisor = static_cast<float>(integerDivisor);
-    if (layout == ReduceLayoutType::reduce_ncsp || layout == ReduceLayoutType::reduce_nspc) {
+    if (layout == ReduceLayoutType::reduce_ncsp) {
         parallel_for2d(OB, OC, [&](size_t ob, size_t oc) {
             uint8_t *out_p = out_ptr + (ob * OC + oc) * OD * OH * OW * dst_data_size;
             auto arg = jit_reduce_post_call_args();
             arg.dst = static_cast<void *>(out_p);
-            arg.oc_off = layout == ReduceLayoutType::reduce_nspc ? 0 : oc * sizeof(float);
-            arg.channel_size = layout == ReduceLayoutType::reduce_nspc ? OW : OC; // OW is related to nspc-ncsp dimension reinterpret
+            arg.oc_off = oc * sizeof(float);
+            arg.channel_size = OC;
             arg.work_amount = OD * OH * OW;
             arg.divisor = &divisor;
             arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
             (*reduce_post_kernel)(&arg);
         });
+    } else if (layout == ReduceLayoutType::reduce_nspc) {
+        size_t num_threads = static_cast<size_t>(parallel_get_max_threads());
+        size_t OP = OB * OC >= num_threads ? OB * OC : OB * OC * OD; // split by OD (and OH below) if too few items
+        if (OP < num_threads && OW > blk_size)
+            OP *= OH;
+        size_t work_amount = OB * OC * OD * OH * OW / OP; // NOTE(review): divides by zero if OP == 0 - confirm
+        parallel_for(OP, [&](size_t op) {
+            uint8_t *out_p = out_ptr + op * work_amount * dst_data_size; // start of the op-th chunk
+            auto arg = jit_reduce_post_call_args();
+            arg.dst = static_cast<void *>(out_p);
+            arg.oc_off = 0;
+            arg.channel_size = OW; // OW is related to nspc-ncsp dimension reinterpret
+            arg.work_amount = work_amount;
+            arg.divisor = &divisor;
+            arg.post_op_data = static_cast<const void **>(postOpsDataPtrs.data());
+            (*reduce_post_kernel)(&arg);
+        });
     } else {
         size_t OCB = div_up(OC, blk_size);
         parallel_for2d(OB, OCB, [&](size_t ob, size_t ocb) {
diff --git a/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp b/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp
index c751387b5b3..93a45ff898e 100644
--- a/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp
+++ b/src/plugins/intel_cpu/tests/functional/single_layer_tests/reduce_ops.cpp
@@ -304,6 +304,11 @@ std::vector<std::vector<ov::test::InputShape>> inputShapes_SmallChannel = {
     {{{{1, 5}, 3, {1, 5}, {1, 10}}, {{2, 3, 2, 2}, {2, 3, 2, 9}}}},
 };
 
+std::vector<std::vector<ov::test::InputShape>> inputShapes_SingleBatch = { // all concrete shapes have first dim 1
+    {{{}, {{1, 19, 2, 9}}}},
+    {{{{1, 5}, 19, {1, 5}, {1, 10}}, {{1, 19, 2, 2}, {1, 19, 2, 9}}}},
+};
+
 std::vector<CPUSpecificParams> cpuParams_4D = {
 #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
         CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
@@ -464,6 +469,19 @@ const auto params_NHWC_SmallChannel = testing::Combine(
         testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
         testing::Values(emptyFusingSpec));
 
+const auto params_SingleBatch = testing::Combine( // SingleBatch shapes x NHWC 4D CPU params, no fusing
+        testing::Combine(
+                testing::ValuesIn(axes),
+                testing::Values(CommonTestUtils::OpType::VECTOR),
+                testing::Values(true),
+                testing::ValuesIn(reductionTypes),
+                testing::ValuesIn(inpOutPrc),
+                testing::Values(ElementType::undefined),
+                testing::Values(ElementType::undefined),
+                testing::ValuesIn(inputShapes_SingleBatch)),
+        testing::ValuesIn(filterCPUSpecificParams(cpuParams_NHWC_4D)),
+        testing::Values(emptyFusingSpec));
+
 INSTANTIATE_TEST_SUITE_P(
         smoke_Reduce_OneAxis_CPU,
         ReduceCPULayerTest,
@@ -516,12 +534,19 @@ INSTANTIATE_TEST_SUITE_P(
 );
 
 INSTANTIATE_TEST_SUITE_P(
-        smoke_Reducea_NHWC_SmallChannel_CPU,
+        smoke_Reduce_NHWC_SmallChannel_CPU,
         ReduceCPULayerTest,
         params_NHWC_SmallChannel,
         ReduceCPULayerTest::getTestCaseName
 );
 
+INSTANTIATE_TEST_SUITE_P( // instantiates ReduceCPULayerTest over params_SingleBatch
+        smoke_Reduce_SingleBatch_CPU,
+        ReduceCPULayerTest,
+        params_SingleBatch,
+        ReduceCPULayerTest::getTestCaseName
+);
+
 /* ================================ 1.2 No fusion - Logical ================================ */
 const auto params_OneAxis_Logical = testing::Combine(
         testing::Combine(