diff --git a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp index 731155df31d..5aa036c1559 100644 --- a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp +++ b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp @@ -7,6 +7,7 @@ #include "dnn_types.h" #include <cstdint> #include <cpp/ie_cnn_network.h> +#include <legacy/ie_layers.h> namespace GNAPluginNS { namespace GNALimitations { @@ -114,5 +115,10 @@ public: bool AreLayersSupported(InferenceEngine::CNNNetwork& network, std::string& errMessage); +inline size_t GetMinBatchToFitInBuffer(InferenceEngine::DataPtr input) { + auto total_size = InferenceEngine::details::product(std::begin(input->getDims()), std::end(input->getDims())); + return total_size / bufferMaxSize + 1; +} + } // namespace GNALimitations } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 36a63e055e5..01581337aec 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -683,7 +683,7 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { auto input = layer->insData[0].lock(); auto outputs = *layer->outData.begin(); - auto reshaped_dims = Get2DReshapedData(input, 8)->getDims(); + auto reshaped_dims = Get2DReshapedData(input, GNALimitations::GetMinBatchToFitInBuffer(input), 8)->getDims(); const uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ?
GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor; uint32_t num_rows_in = reshaped_dims[1]; @@ -908,7 +908,7 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) { auto inputs = layer->insData.begin()->lock(); auto outputs = *layer->outData.begin(); - auto reshaped_dims = Get2DReshapedData(inputs, 8)->getDims(); + auto reshaped_dims = Get2DReshapedData(inputs, GNALimitations::GetMinBatchToFitInBuffer(inputs), 8)->getDims(); uint32_t num_rows_in = reshaped_dims[1]; uint32_t num_columns_in = reshaped_dims[0]; uint32_t num_rows_out = num_rows_in; @@ -1410,7 +1410,8 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool noOfInputsDivisor = GNALimitations::noOfInputsLowPrecDivisor; } - auto input_data = HasTo2DReshapeData(layer) ? Get2DReshapedData(inputs, 8) : inputs; + auto input_data = HasTo2DReshapeData(layer) ? + Get2DReshapedData(inputs, GNALimitations::GetMinBatchToFitInBuffer(inputs), 8) : inputs; auto in_dims = input_data->getDims(); auto batch_size = (in_dims.size() == 1) ? 
 1 : in_dims.front(); uint32_t num_rows_in = InferenceEngine::details::product(in_dims) / batch_size; diff --git a/inference-engine/src/gna_plugin/gna_groups.hpp b/inference-engine/src/gna_plugin/gna_groups.hpp index 704588a153d..9c4654e1adc 100644 --- a/inference-engine/src/gna_plugin/gna_groups.hpp +++ b/inference-engine/src/gna_plugin/gna_groups.hpp @@ -15,7 +15,9 @@ namespace GNAPluginNS { * @param input a pointer to data to be reshaped * @param maxZeroDimSize the maximum size of zero dimension */ -inline InferenceEngine::DataPtr Get2DReshapedData(InferenceEngine::DataPtr input, size_t maxZeroDimSize) { +inline InferenceEngine::DataPtr Get2DReshapedData(InferenceEngine::DataPtr input, size_t minZeroDimSize, + size_t maxZeroDimSize) { + IE_ASSERT(minZeroDimSize > 0); auto dims = input->getDims(); uint32_t numRowsIn = InferenceEngine::details::product(begin(dims), end(dims)); uint32_t numColumnsIn = 1; @@ -23,7 +25,7 @@ inline InferenceEngine::DataPtr Get2DReshapedData(InferenceEngine::DataPtr input if (numRowsIn % 8 == 0) { if (dims.size() >= 2 || dims[0] >= maxZeroDimSize) { size_t indexDivide = maxZeroDimSize; - while (indexDivide > 1) { + while (indexDivide > minZeroDimSize) { if ((numRowsIn / 8) % indexDivide == 0) break; --indexDivide; } @@ -55,4 +57,5 @@ inline bool HasTo2DReshapeData(InferenceEngine::CNNLayerPtr layer) { // Don't reshape diagonallayers with bias connection return !GNAPluginNS::LayerInfo(getCreatorLayer(layer->insData.front().lock()).lock()).has32BOutput(); } + } // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/layers/gna_split_layer.hpp b/inference-engine/src/gna_plugin/layers/gna_split_layer.hpp index c6c16ffe99a..161c3da66f4 100644 --- a/inference-engine/src/gna_plugin/layers/gna_split_layer.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_split_layer.hpp @@ -45,4 +45,18 @@ public: }; std::vector<SplitConnectedLayerInfo> splitOutputLayers; }; + +// @brief Returns sizes of split outputs to split the input
tensor to aligned parts not greater than the specified size +static std::vector<uint32_t> GetAlignedSplitSizes(uint32_t totalSize, uint32_t maxSplitSize, uint32_t alignment = 64) { + std::vector<uint32_t> splitSizes; + uint32_t maxAlignedSplitSize = (maxSplitSize >= alignment) ? (maxSplitSize - maxSplitSize % alignment) : maxSplitSize; + uint32_t usedSize = 0; + while (usedSize < totalSize) { + uint32_t partSize = std::min(totalSize - usedSize, maxAlignedSplitSize); + splitSizes.push_back(partSize); + usedSize += partSize; + } + return splitSizes; +} + } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index b92bd153370..b7507e8fbf9 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -87,7 +87,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer, }); IE_ASSERT(inputLayer != nullptr); size_t weightsSize = LayerInfo(prevLayer).has32BOutput() ?
nextLayer->outData[0]->getDims().back() : - Get2DReshapedData(nextLayer->outData[0], 8)->getDims()[1]; + Get2DReshapedData(nextLayer->outData[0], GNALimitations::GetMinBatchToFitInBuffer(nextLayer->outData[0]), 8)->getDims()[1]; std::vector<float> weightsValues(weightsSize, fillValue); IE_ASSERT(diagLayer != nullptr); diagLayer->_weights = make_shared_blob<float>( @@ -1113,6 +1113,9 @@ void InsertConcatAligningFilterPass::run() { SizeVector({filterWeights.size()}), Layout::C)); concatAligningFilter->_weights->allocate(); + if (!concatAligningFilter->_weights->buffer().as<float*>()) { + THROW_GNA_EXCEPTION << "Failed to allocate weights of size " << filterWeights.size() << " for " << filterName; + } CopyVectorToBlob(concatAligningFilter->_weights, filterWeights); @@ -1395,15 +1398,20 @@ void EltwiseSplitOverChannelsPass::run() { THROW_GNA_LAYER_EXCEPTION(l) << "number of outputs expected to be 1"; } auto oData = l->outData.front(); - auto out_width = GetDataDimSize(oData, DataDimName::W); - auto totalElementsForOutput = details::product(oData->getDims().begin(), oData->getDims().end()); - // gna limit this to be OxFFFF - auto maxAffineElements = 65536 - 64; - if (totalElementsForOutput <= maxAffineElements) { + auto oDims = oData->getDims(); + auto totalElementsSize = details::product(std::begin(oDims), std::end(oDims)); + if (totalElementsSize <= GNALimitations::bufferMaxSize) { continue; } - auto totalSplits = 1 + totalElementsForOutput / maxAffineElements; + auto firstValuableDim = std::find_if(std::begin(oDims), std::end(oDims), [](size_t val) { return val > 1; }); + IE_ASSERT(firstValuableDim != std::end(oDims)); + auto splittedElementsSize = *firstValuableDim; + auto splittedDimIx = std::distance(std::begin(oDims), firstValuableDim); + + // Split output size should be multiple of 64 to avoid align filters insertion + auto splitSizes = GetAlignedSplitSizes(splittedElementsSize, + GNALimitations::bufferMaxSize * splittedElementsSize / totalElementsSize); pass_trace() << "transforming
" << LAYER_NAME(l) << " by splitting it to multiple eltwise operations\n"; auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(l); @@ -1421,27 +1429,13 @@ void EltwiseSplitOverChannelsPass::run() { auto inputDesc = l->insData[kThEltwiseInput].lock()->getTensorDesc(); // create split layer outputs - size_t usedElements = 0; - for (size_t i = 0; i < totalSplits; i++) { - SizeVector newDims; - size_t elements_num = std::min(totalElementsForOutput - usedElements, - static_cast<size_t>(maxAffineElements)); - if (inputDesc.getDims().size() == 2) { - newDims = SizeVector{1, elements_num}; - } else { - elements_num = elements_num - elements_num % out_width; - newDims = SizeVector{1, elements_num / out_width, out_width}; - } - + for (auto elementsNum : splitSizes) { + auto newDims = oDims; + newDims[splittedDimIx] = elementsNum; auto newDesc = TensorDesc(inputDesc.getPrecision(), newDims, inputDesc.getLayout()); auto data = std::make_shared<Data>(l->name + "/" + std::to_string(kThEltwiseInput) + "/1", newDesc); getCreatorLayer(data) = split; split->outData.push_back(data); - - usedElements += elements_num; - if (usedElements == totalElementsForOutput) { - break; - } } // replacing connection X->eltwise to X->split auto oData = CNNLayerFindOutData(l, kThEltwiseInput); @@ -1461,7 +1455,7 @@ void EltwiseSplitOverChannelsPass::run() { concat->outData.push_back(masterEltwise->outData.front()); getCreatorLayer(masterEltwise->outData.front()) = concat; - for (size_t k = 0; k != totalSplits; k++) { + for (size_t k = 0; k != splitSizes.size(); k++) { auto eltwiseRaw = std::make_shared<EltwiseLayer>( LayerParams{l->name + "/eltwise/" + std::to_string(k), "Eltwise", Precision::FP32}); IE_ASSERT(eltwiseRaw != nullptr);
+ Get2DReshapedData(insData, GNALimitations::GetMinBatchToFitInBuffer(insData), 8)->getDims() : + insData->getDims(); } if (dataDims.size() <= 2) { diff --git a/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp index 4043c4aa5f0..b29cc04dac0 100644 --- a/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp +++ b/inference-engine/src/gna_plugin/transformations/split_convolution_with_large_buffer_size.cpp @@ -12,6 +12,7 @@ #include <ngraph/opsets/opset7.hpp> #include <ngraph/pattern/op/wrap_type.hpp> #include "backend/gna_limitations.hpp" +#include "layers/gna_split_layer.hpp" using namespace GNAPluginNS; @@ -19,22 +20,6 @@ NGRAPH_RTTI_DEFINITION(SplitConvolution, "SplitConvolution", 0); NGRAPH_RTTI_DEFINITION(SplitConvolutionWithBias, "SplitConvolutionWithBias", 0); NGRAPH_RTTI_DEFINITION(SplitConvolutionWithFq, "SplitConvolutionWithFq", 0); -static std::vector<int64_t> GetConvSplitSizes(std::shared_ptr<ngraph::Node> conv) { - uint32_t width = conv->get_input_shape(0).back(); - uint32_t in_channels = conv->get_input_shape(0).at(1); - uint32_t usedWidth = 0; - std::vector<int64_t> split_sizes; - uint32_t width_max_size = GNALimitations::bufferMaxSize / in_channels; - width_max_size = width_max_size - width_max_size % 64; - while (usedWidth < width) { - uint32_t width_part = std::min(width - usedWidth, width_max_size); - split_sizes.push_back(width_part); - usedWidth += width_part; - } - IE_ASSERT(usedWidth == width); - return split_sizes; -} - static bool Convert(std::shared_ptr<ngraph::Node> conv, std::shared_ptr<ngraph::Node> add, std::shared_ptr<ngraph::Node> bias, @@ -45,15 +30,21 @@ static bool Convert(std::shared_ptr<ngraph::Node> conv, return false; } - auto split_sizes = GetConvSplitSizes(conv); + uint32_t width = conv->get_input_shape(0).back(); + uint32_t in_channels = conv->get_input_shape(0).at(1); + auto split_sizes = GetAlignedSplitSizes(width, GNALimitations::bufferMaxSize / in_channels); IE_ASSERT(split_sizes.size() > 1);
+ std::vector<int64_t> split_sizes_casted(split_sizes.size()); + std::transform(std::begin(split_sizes), std::end(split_sizes), std::begin(split_sizes_casted), [](uint32_t size) { + return static_cast<int64_t>(size); + }); /* TODO check if it's NHWC convolution wrapped with transposes or all input dimensions except of width == 1, otherwise this split axis isn't supported */ const int64_t width_axis = conv->get_input_shape(0).size() - 1; auto split_node = std::make_shared<ngraph::opset7::Split>(conv->input_value(0), ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector<int64_t>{width_axis}), - ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_sizes.size()}), split_sizes)); + ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_sizes_casted.size()}), split_sizes_casted)); ngraph::copy_runtime_info(conv, split_node); split_node->set_friendly_name(conv->get_friendly_name() + "/split"); ngraph::OutputVector convOutputs; diff --git a/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp b/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp index 5f69ab02615..dd6424be051 100644 --- a/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp +++ b/inference-engine/tests/functional/plugin/gna/pass_tests/eltwise_split_over_channels_pass.cpp @@ -54,8 +54,8 @@ protected: auto params = ngraph::builder::makeParams(ngPrc, { inputShape }); auto const_mult2 = ngraph::builder::makeConstant<float>(ngPrc, inputShape, {-1.0f}); - auto sum = ngraph::builder::makeEltwise(params[0], const_mult2, ngraph::helpers::EltwiseTypes::MULTIPLY); - function = std::make_shared<ngraph::Function>(sum, params, "EltwiseSplitOverChannelsPassTest"); + auto mul = ngraph::builder::makeEltwise(params[0], const_mult2, ngraph::helpers::EltwiseTypes::MULTIPLY); + function = std::make_shared<ngraph::Function>(mul, params, "EltwiseSplitOverChannelsPassTest"); } }; @@ -77,7 +77,8 @@ const std::vector<std::map<std::string, std::string>> configs =
{ const std::vector<std::vector<size_t>> inputShape = { {1, 67000}, - {1, 500000} + {1, 500000}, + {1, 936, 513} }; INSTANTIATE_TEST_SUITE_P(smoke_EltwiseSplitOverChennels, EltwiseSplitOverChannelsPassTest, diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp index 9de08e5f84f..71c0cfc3d70 100644 --- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp +++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp @@ -47,6 +47,7 @@ std::map<std::vector<size_t>, std::vector<std::vector<size_t>>> basic = { {{1, 4, 4, 128}, {{}}}, {{8}, {{}}}, {{5}, {{}}}, + {{1, 936, 513}, {{}}} }; const auto basicCases = ::testing::Combine( diff --git a/inference-engine/tests/unit/gna/gna_get_2d_reshaped_data.cpp b/inference-engine/tests/unit/gna/gna_get_2d_reshaped_data.cpp index 5af1bf88fa7..5e96984e152 100644 --- a/inference-engine/tests/unit/gna/gna_get_2d_reshaped_data.cpp +++ b/inference-engine/tests/unit/gna/gna_get_2d_reshaped_data.cpp @@ -65,7 +65,7 @@ class Get2DReshapedDataTest : public ::testing::Test { InferenceEngine::Layout layout) const { auto data = std::make_shared<InferenceEngine::Data>(input_name, InferenceEngine::TensorDesc(precision, input_shape.first, layout)); - auto new_data = GNAPluginNS::Get2DReshapedData(data, max_batch_size); + auto new_data = GNAPluginNS::Get2DReshapedData(data, 1, max_batch_size); ASSERT_EQ(new_data->getDims(), input_shape.second); ASSERT_EQ(new_data->getPrecision(), precision); ASSERT_EQ(new_data->getLayout(), layout); diff --git a/inference-engine/tests/unit/gna/gna_get_aligned_split_sizes.cpp b/inference-engine/tests/unit/gna/gna_get_aligned_split_sizes.cpp new file mode 100644 index 00000000000..5d017248e49 --- /dev/null +++ b/inference-engine/tests/unit/gna/gna_get_aligned_split_sizes.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2021 Intel Corporation +//
SPDX-License-Identifier: Apache-2.0 +// + +#include <vector> + +#include <gtest/gtest.h> +// to suppress deprecated definition errors +#define IMPLEMENT_INFERENCE_ENGINE_PLUGIN +#include "layers/gna_split_layer.hpp" + +namespace { + +using GetAlignedSplitSizesData = std::tuple< + uint32_t, // total size + uint32_t, // maximum split size + uint32_t, // alignment + std::vector<uint32_t> // expected sizes +>; + +const std::vector<GetAlignedSplitSizesData> data = { + GetAlignedSplitSizesData{1024, 100, 64, std::vector<uint32_t>(16, 64)}, + GetAlignedSplitSizesData{151, 100, 64, std::vector<uint32_t>{64, 64, 23}}, + GetAlignedSplitSizesData{151, 65, 32, std::vector<uint32_t>{64, 64, 23}}, + GetAlignedSplitSizesData{151, 65, 1, std::vector<uint32_t>{65, 65, 21}} +}; + +TEST(GetAlignedSplitSizesTest, testAlignedSplitSizes) { + for (const auto &dataItem : data) { + auto sizes = GNAPluginNS::GetAlignedSplitSizes(std::get<0>(dataItem), std::get<1>(dataItem), + std::get<2>(dataItem)); + ASSERT_EQ(sizes, std::get<3>(dataItem)); + } +} + +} // namespace