diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
index 9bd51f665f9..c363bffd6b6 100644
--- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
+++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
@@ -251,5 +251,33 @@ void DnnlPostOpsComposer::appendClip(const std::vector<float>& low, const std::v
     }
 }
 
+void DnnlPostOpsComposer::appendDecompressionScales(const std::vector<float>& scales) {
+    if (scales.empty())
+        return;
+
+    int mask = scales.size() > 1 ? weightScaleMaskPerChannel : 0;
+    DEBUG_LOG("Set weights scales mask ", "DNNL_ARG: ", DNNL_ARG_WEIGHTS, " mask: ", mask);
+    attr.set_scales_mask(DNNL_ARG_WEIGHTS, mask);
+
+    DnnlBlockedMemoryDesc memoryDesc(InferenceEngine::Precision::FP32, Shape({scales.size()}));
+    auto mem = std::make_shared<Memory>(engine, memoryDesc);
+    memcpy(mem->getData(), scales.data(), scales.size() * sizeof(float));
+    args[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = mem;
+}
+
+void DnnlPostOpsComposer::appendDecompressionZeroPoints(const std::vector<float>& zero_points) {
+    if (zero_points.empty())
+        return;
+
+    int mask = zero_points.size() > 1 ? weightScaleMaskPerChannel : 0;
+    DEBUG_LOG("Set weights zero points mask ", "DNNL_ARG: ", DNNL_ARG_WEIGHTS, " mask: ", mask);
+    attr.set_zero_points_mask(DNNL_ARG_WEIGHTS, mask);
+
+    DnnlBlockedMemoryDesc memoryDesc(InferenceEngine::Precision::FP32, Shape({zero_points.size()}));
+    auto mem = std::make_shared<Memory>(engine, memoryDesc);
+    memcpy(mem->getData(), zero_points.data(), zero_points.size() * sizeof(float));
+    args[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = mem;
+}
+
 }   // namespace intel_cpu
 }   // namespace ov
diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h
index 4ddd2b7ca79..344d93e649a 100644
--- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h
+++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h
@@ -42,6 +42,9 @@ public:
     bool appendLinear(const std::vector<float>& scale, const std::vector<float>& shift, bool isLastPostOp, bool allowBinary = true);
     void appendClip(const std::vector<float>& low, const std::vector<float>& high);
 
+    void appendDecompressionScales(const std::vector<float>& scales);
+    void appendDecompressionZeroPoints(const std::vector<float>& zero_points);
+
     const VectorDims& getOutputDims() {
         return outputDims;
     }
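Note on the two composer methods added above: they do not append post-ops; they translate the decompression constants into oneDNN runtime weight scales / zero points, i.e. a mask set on the `primitive_attr` plus an extra f32 memory passed with the execution arguments under `DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS` (resp. `DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS`). Below is a minimal standalone sketch of that oneDNN mechanism; it is illustrative only and not code from this patch. `OC` and the mask value are assumptions, and whether a given primitive actually accepts u8 weights next to f32 activations depends on the bundled oneDNN revision (hence the submodule bump at the end of the diff).

```cpp
// Standalone sketch of oneDNN runtime weight scales / zero points (OC is a hypothetical size).
#include <cstring>
#include <unordered_map>
#include <vector>
#include <dnnl.hpp>

int main() {
    const int OC = 256;  // assumed output-channel count of a 2-D OC x IC weights tensor
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);

    dnnl::primitive_attr attr;
    // mask 1 << 0: one value per output channel (dim 0 of the weights); mask 0 would mean per-tensor.
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 1 << 0);
    attr.set_zero_points_mask(DNNL_ARG_WEIGHTS, 1 << 0);

    // Decompression parameters are plain f32 buffers, just like the composer builds them.
    std::vector<float> scales(OC, 0.05f), zero_points(OC, 8.0f);
    dnnl::memory::desc param_md({OC}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
    dnnl::memory scales_mem(param_md, eng), zp_mem(param_md, eng);
    std::memcpy(scales_mem.get_data_handle(), scales.data(), scales.size() * sizeof(float));
    std::memcpy(zp_mem.get_data_handle(), zero_points.data(), zero_points.size() * sizeof(float));

    // At execution time these memories are passed alongside the regular SRC/WEIGHTS/DST arguments,
    // which is what the composer's args map ends up holding.
    std::unordered_map<int, dnnl::memory> exec_args;
    exec_args[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = scales_mem;
    exec_args[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zp_mem;
    // prim.execute(stream, exec_args) would consume them together with the data tensors.
    return 0;
}
```

In the patch itself the same wiring goes through the plugin's `DnnlBlockedMemoryDesc`/`Memory` wrappers and is stored in the composer's `args` map.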
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 6b7f4042c58..d40c7df913f 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -75,6 +75,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) {
     FuseConvMatmulFCDeconvAndDQScales(graph);
     graph.RemoveDroppedNodes();
 
+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndWeightsDecompression");
+    FuseFCAndWeightsDecompression(graph);
+    graph.RemoveDroppedNodes();
+
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias");
     FuseConvolutionMatMulDeconvAndBias(graph);
     graph.RemoveDroppedNodes();
@@ -283,6 +287,9 @@ void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) {
         return node->getType() == expectedType && node->getChildEdges().size() == 1;
     };
 
+    if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2))
+        return;
+
     auto& graphNodes = graph.GetNodes();
     for (size_t i = 0; i < graphNodes.size(); i++) {
         const auto fcNode = dynamic_cast<node::FullyConnected*>(graphNodes[i].get());
@@ -323,6 +330,8 @@ void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) {
             continue;
 
         // Precision limitations
+        if (fcNode->getOriginalInputPrecisionAtPort(0) != Precision::FP32)
+            continue;
         if (multiplyConstNode->getOriginalOutputPrecisionAtPort(0) != Precision::FP32)
             continue;
         if (supportedWeightsPrecisions.find(weightsNode->getOriginalOutputPrecisionAtPort(0)) == supportedWeightsPrecisions.end())
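For context, the subgraph `FuseFCAndWeightsDecompression` matches is the usual compressed-weights pattern: a u8 weights `Constant`, a `Convert` to f32, an optional zero-point `Subtract`, and a per-output-channel `Multiply` whose result feeds the FullyConnected (a MatMul with constant weights on the second input). A sketch of such a model built with the OpenVINO C++ API is below; it is illustrative only, and the shapes, constant values, and opset choice are assumptions rather than something the pass prescribes.

```cpp
// Illustrative compressed-weights pattern (assumed shapes/values), not code from the pass.
#include <memory>
#include <vector>
#include <openvino/openvino.hpp>
#include <openvino/opsets/opset10.hpp>

std::shared_ptr<ov::Model> make_compressed_fc_pattern() {
    using namespace ov::opset10;
    const size_t IC = 512, OC = 256;

    auto activations = std::make_shared<Parameter>(ov::element::f32, ov::PartialShape{1, 4, 512});

    // Weights stay u8 in memory; the Convert -> Subtract -> Multiply chain is the part the pass folds away.
    auto weights  = Constant::create(ov::element::u8, ov::Shape{OC, IC}, std::vector<uint8_t>(OC * IC, 1));
    auto convert  = std::make_shared<Convert>(weights, ov::element::f32);
    auto zero_pt  = Constant::create(ov::element::f32, ov::Shape{OC, 1}, std::vector<float>(OC, 8.f));
    auto subtract = std::make_shared<Subtract>(convert, zero_pt);
    auto scales   = Constant::create(ov::element::f32, ov::Shape{OC, 1}, std::vector<float>(OC, 0.05f));
    auto multiply = std::make_shared<Multiply>(subtract, scales);

    // transpose_b = true: weights are laid out OC x IC.
    auto matmul = std::make_shared<MatMul>(activations, multiply, false, true);
    return std::make_shared<ov::Model>(ov::OutputVector{matmul}, ov::ParameterVector{activations});
}
```

After the fusion the Convert/Subtract/Multiply chain is dropped from the graph and its constants end up as the FC node's decompressionMultiply/decompressionSubtract vectors (see the fullyconnected.h hunk below).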
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index f85f2c15a93..25af6f996f6 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -192,10 +192,6 @@ void FullyConnected::getSupportedDescriptors() {
     if (getChildEdges().empty())
         IE_THROW()<< errorPrefix << " has incorrect number of output edges";
 
-    withBiases = getOriginalInputsNumber() == 3;
-
-    useSparseWeights = useSparseWeightsDecompression();
-
     auto inputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(DATA_ID));
     outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));
 
@@ -203,6 +199,13 @@ void FullyConnected::getSupportedDescriptors() {
         outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0));
     }
     auto weightsDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(WEIGHTS_ID));
+
+    withBiases = getOriginalInputsNumber() == 3;
+
+    useSparseWeights = useSparseWeightsDecompression();
+    useWeightsDecompressionImpl = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2) &&
+                                  inputDataType == memory::data_type::f32 && weightsDataType == memory::data_type::u8;
+
     // revert back outputDataType on special cases
     if (inputDataType == memory::data_type::f32) {
         // oneDNN only support f32 output when input is f32, even if FQ is fused
@@ -240,7 +243,7 @@ void FullyConnected::getSupportedDescriptors() {
 #if defined(OV_CPU_WITH_MLAS) && (defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64))
     // MLAS doesn't support post-ops fusing and only supports FP32. INT8 is not enabled yet
     // Disable MLAS when FC could fuse post-ops
-    useMlas = !useSparseWeights &&
+    useMlas = !useSparseWeights && !useWeightsDecompressionImpl &&
              (inputDataType == memory::data_type::f32 && weightsDataType == memory::data_type::f32) &&
               fusedWith.empty();
     auto wgtDims = getInputShapeAtPort(WEIGHTS_ID).getStaticDims();
@@ -587,6 +590,11 @@ void FullyConnected::setPostOps(dnnl::primitive_attr& attr, const VectorDims& di
     DnnlPostOpsComposer dnnlpoc(getEngine(), attr, ops, postOpsArgs, dims, dims.size() - 1, canBeExecutedInInt8(),
                                 1 << 0, getDQScales(), withBiases);
 
+    if (!decompressionMultiply.empty())
+        dnnlpoc.appendDecompressionScales(decompressionMultiply);
+    if (!decompressionSubtract.empty())
+        dnnlpoc.appendDecompressionZeroPoints(decompressionSubtract);
+
     for (size_t i = 0; i < fusedWith.size(); ++i) {
         auto& node = fusedWith[i];
         bool isLastPostOp = (i == (fusedWith.size() - 1));
@@ -673,11 +681,14 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
     dnnl::memory::data_type wdt = indt;
     dnnl::memory::data_type bdt = outdt;
 
-    if (one_of(indt, dnnl::memory::data_type::bf16, dnnl::memory::data_type::f16)) {
-        //oneDNN ARM InnerProduct primitive supports only identical in/out data types
+    if (useWeightsDecompressionImpl) {
+        // Weights decompression case
+        wdt = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(WEIGHTS_ID));
+    } else if (one_of(indt, dnnl::memory::data_type::bf16, dnnl::memory::data_type::f16)) {
 #if defined(OPENVINO_ARCH_X86_64)
         bdt = dnnl::memory::data_type::f32;
 #else
+        // oneDNN ARM InnerProduct primitive supports only identical in/out data types
         bdt = dnnl::memory::data_type::f16;
 #endif
     } else if (indt == dnnl::memory::data_type::u8 || indt == dnnl::memory::data_type::s8) {
@@ -939,6 +950,9 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
     bool retVal = false;
     const auto inRank = getInputShapeAtPort(DATA_ID).getRank();
     const auto weightRank = getInputShapeAtPort(WEIGHTS_ID).getRank();
+    if (useWeightsDecompressionImpl) {
+        return false;
+    }
     // disable rank=4:
     // if layout is nhwc:
     //   A matrix: N * IC * H * W --> N * (IC*H*W), the M, N', K of matrix multiply will be:
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 2f9a11e5b57..a3444666788 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -115,6 +115,7 @@ private:
     void prepackMLASWeight();
 #endif
 
+    bool useWeightsDecompressionImpl = false;
    std::vector<float> decompressionSubtract;
     std::vector<float> decompressionMultiply;
 };
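With `useWeightsDecompressionImpl` enabled, `createDescriptorInternal` keeps `wdt` at the original (u8) weights precision, so the primitive is expected to decompress weights on the fly instead of the graph materializing an f32 copy. Numerically the intent is plain per-output-channel dequantization; a reference-style sketch of that math follows (an illustration under assumed buffer layouts, not code from the patch).

```cpp
// Reference-style dequantization sketch (assumed row-major OC x IC layout).
#include <cstdint>
#include <vector>

// scales / zero_points hold either one value per output channel or a single per-tensor value;
// zero_points may be empty when the pattern has no Subtract.
std::vector<float> decompress_weights(const std::vector<uint8_t>& weights_u8,
                                      const std::vector<float>& scales,
                                      const std::vector<float>& zero_points,
                                      size_t OC, size_t IC) {
    std::vector<float> result(OC * IC);
    for (size_t oc = 0; oc < OC; ++oc) {
        const float scale = scales.size() == 1 ? scales[0] : scales[oc];
        const float zp = zero_points.empty() ? 0.f
                                             : (zero_points.size() == 1 ? zero_points[0] : zero_points[oc]);
        for (size_t ic = 0; ic < IC; ++ic) {
            const size_t idx = oc * IC + ic;
            result[idx] = (static_cast<float>(weights_u8[idx]) - zp) * scale;
        }
    }
    return result;
}
```

The decompressionMultiply / decompressionSubtract vectors that setPostOps now forwards to the composer hold exactly these per-channel scales and zero points.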
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
index 522066d82ae..d962766143c 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
@@ -188,8 +188,6 @@ std::vector<std::string> disabledTestPatterns() {
         // New plugin API doesn't support changes of pre-processing
         R"(.*(Auto|Multi|Hetero).*InferRequestPreprocessTest.*SetPreProcessToInputInfo.*)",
         R"(.*(Auto|Multi|Hetero).*InferRequestPreprocessTest.*SetPreProcessToInferRequest.*)",
-        // Issue: 113727
-        R"(.*MatMulCompressedWeights.*)",
         // TODO: for 22.2 (CVS-68949)
         R"(.*smoke_AutoBatching_CPU/AutoBatching_Test_DetectionOutput.*)",
     };
diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_weights_decompression.cpp
index 3e8298bdb36..f1d0ebb70e0 100644
--- a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_weights_decompression.cpp
+++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/matmul_weights_decompression.cpp
@@ -189,7 +189,9 @@ protected:
         }
 
         std::map<std::string, std::string> additional_config = std::get<5>(test_param);
-        const size_t expected_count = additional_config[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES ? 1 : 0;
+        const size_t expected_count =
+            InferenceEngine::with_cpu_x86_avx2() &&
+            additional_config[PluginConfigParams::KEY_ENFORCE_BF16] != PluginConfigParams::YES ? 0 : 1;
         CheckNumberOfNodesWithType(compiledModel, "Convert", expected_count);
         CheckNumberOfNodesWithType(compiledModel, "Eltwise", expected_count);
         CheckNumberOfNodesWithType(compiledModel, "Subgraph", 0);
@@ -220,6 +222,9 @@ const std::vector<std::vector<InputShape>> input_shapes_basic = {
     {{{}, {{1, 4, 48}}}, {{}, {{48, 256}}}},
     {{{}, {{1, 4, 512}}}, {{}, {{512, 256}}}},
     {{{}, {{1, 16, 32}}}, {{}, {{32, 64}}}},
+    {{{}, {{2, 4, 32}}}, {{}, {{32, 65}}}},
+    {{{}, {{11, 339, 377}}}, {{}, {{377, 335}}}},
+    {{{}, {{3, 12, 768}}}, {{}, {{768, 1024}}}},
 };
 const std::vector<fusingSpecificParams> fusingParamsSet {
         emptyFusingSpec,
diff --git a/src/plugins/intel_cpu/tests/functional/target_per_test.cmake b/src/plugins/intel_cpu/tests/functional/target_per_test.cmake
index e6e7b64de26..f5ebcfc40f5 100644
--- a/src/plugins/intel_cpu/tests/functional/target_per_test.cmake
+++ b/src/plugins/intel_cpu/tests/functional/target_per_test.cmake
@@ -87,7 +87,7 @@ function(create_target_per_test_for_directory TEST_DIR TARGET_PREFIX)
 endfunction()
 
 if(ENABLE_CPU_SPECIFIC_TARGET_PER_TEST)
-  create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/src/arm ov_cpu_func_subgraph)
+  create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/src ov_cpu_func_subgraph)
   create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests ov_cpu_func_slt)
 endif()
 
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 2c0b3f29461..ec7a051b3f4 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 2c0b3f2946185370b3ccbe4330398c15109e5ec4
+Subproject commit ec7a051b3f4a9e65b22382f3d787e84ce74efe07