[CPU] Combine DQ scales and multiply into DQ scales. (#17276)

* Fix the multiple-multiply case.

* Add test case.

* Fix CI issues.

* Fix the dynamic-shape FC kernel creation issue.

* Expose FC weight with original linear layout to CPU graph.

* Apply review comments.

* Apply review comments.

* Applied review comments.
Luwei Zhou 2023-07-04 19:38:25 +08:00 committed by GitHub
parent 2e9e6d2dd4
commit 0b3a004825
8 changed files with 86 additions and 45 deletions

View File

@@ -49,7 +49,8 @@ protected:
         actualFunction = ngraph::builder::subgraph::MarkupBiasFunction::get(precision,
                                                                             test_values.input_shape,
                                                                             test_values.bias_shape,
-                                                                            layer_type);
+                                                                            layer_type,
+                                                                            false);
         SimpleLowPrecisionTransformer transformer;
         transformer.transform(actualFunction);
     }

View File

@@ -248,7 +248,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) {
         auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(),
                                                   node->getOutputShapeAtPort(0).getDims().size());
         auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), 1, std::multiplies<size_t>());
-        node->initializeDQScales(scalesData, scaleSize);
+        node->fuseDQScales(scalesData, scaleSize);
         return true;
     };
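The scale count passed to the renamed fuseDQScales() is simply the element count of the scales tensor after its dims are normalized to the output rank. A minimal sketch of that computation, assuming getNormalizedDimsBySize() pads the dims with leading ones (normalizeDims below is a hypothetical stand-in for it, not the OpenVINO helper):

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Assumed behavior of getNormalizedDimsBySize: pad the dims with leading 1s
// up to the destination rank (illustrative re-implementation).
static std::vector<std::size_t> normalizeDims(std::vector<std::size_t> dims, std::size_t rank) {
    dims.insert(dims.begin(), rank - dims.size(), 1);
    return dims;
}

int main() {
    // A per-channel DQ scales tensor of shape {64, 1, 1} against a 4D output.
    auto scalesDims = normalizeDims({64, 1, 1}, 4);  // -> {1, 64, 1, 1}
    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(),
                                     std::size_t{1}, std::multiplies<std::size_t>());
    std::cout << scaleSize << "\n";  // 64: one scale per output channel
    return 0;
}

A per-tensor scale would give scaleSize == 1, which is the broadcast case fuseDQScales() handles below.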

View File

@@ -1650,18 +1650,23 @@ void Node::addSupportedPrimDesc(const std::vector<PortConfigurator>& inPortConfi
     supportedPrimitiveDescriptors.emplace_back(config, implType);
 }
 
-void Node::initializeDQScales(const float* scaleData, const size_t scaleSize) {
-    if (!DQScales.empty() || !scaleSize)
-        IE_THROW() << "DQ scales is preset or scale size is 0, ##" << getName();
-    DQScales.reserve(scaleSize);
-    bool scalePerTensor = true;
-    for (size_t i = 0; i < scaleSize; i++) {
-        DQScales.push_back(scaleData[i]);
-        if (scaleData[i] != scaleData[0])
-            scalePerTensor = false;
-    }
-    if (scalePerTensor)
+void Node::fuseDQScales(const float* scaleData, const size_t scaleSize) {
+    if (DQScales.empty())
+        DQScales.resize(scaleSize, 1.0);
+    IE_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize)
+        << "set invalid scales size , DQScales vector size: " << DQScales.size()
+        << ", scale data size: " << scaleSize
+        << "Node: ##" << getName();
+    if (scaleSize > DQScales.size())
+        DQScales.resize(scaleSize, DQScales[0]);
+    if (1 == scaleSize) {
+        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val){ return (scaleData[0] * val); });
+    } else {
+        for (size_t i = 0; i < DQScales.size(); i++) {
+            DQScales[i] *= scaleData[i];
+        }
+    }
+    if (std::all_of(DQScales.begin(), DQScales.end(), [=](float val){ return (val == DQScales[0]);}))
         DQScales.resize(1);
 }
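The rename from initializeDQScales() to fuseDQScales() reflects the behavioral change: instead of asserting that the scales are unset, the function may now be called once per fused Multiply, accumulating each incoming scale set into DQScales by elementwise multiplication (broadcasting per-tensor against per-channel in either direction) and collapsing back to a single value when all channels agree. A standalone sketch of those semantics, with IE_ASSERT swapped for a plain assert:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative re-implementation of the fuseDQScales semantics above;
// not the OpenVINO Node member itself.
void fuseDQScales(std::vector<float>& DQScales, const float* scaleData, std::size_t scaleSize) {
    if (DQScales.empty())
        DQScales.resize(scaleSize, 1.0f);
    assert(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize);
    if (scaleSize > DQScales.size())
        DQScales.resize(scaleSize, DQScales[0]);      // broadcast current per-tensor scale
    if (scaleSize == 1) {
        for (auto& val : DQScales)
            val *= scaleData[0];                      // broadcast incoming per-tensor scale
    } else {
        for (std::size_t i = 0; i < DQScales.size(); i++)
            DQScales[i] *= scaleData[i];              // elementwise per-channel fuse
    }
    // Collapse to a single scale when every channel ends up identical.
    if (std::all_of(DQScales.begin(), DQScales.end(),
                    [&](float val) { return val == DQScales[0]; }))
        DQScales.resize(1);
}

int main() {
    std::vector<float> scales;                        // starts empty
    const float perChannel[] = {0.5f, 0.25f};
    fuseDQScales(scales, perChannel, 2);              // DQ scales: {0.5, 0.25}
    const float perTensor[] = {2.0f};
    fuseDQScales(scales, perTensor, 1);               // extra Multiply fused: {1.0, 0.5}
    for (float s : scales) std::cout << s << " ";     // prints: 1 0.5
    std::cout << "\n";
    return 0;
}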

View File

@@ -535,7 +535,7 @@ public:
      */
     std::pair<std::vector<float>, std::vector<float>> getScalesAndShifts(const Node *parentNode) const;
-    void initializeDQScales(const float* scaleData, const size_t scaleSize);
+    void fuseDQScales(const float* scaleData, const size_t scaleSize);
     const std::vector<float>& getDQScales() const {
         return DQScales;
     }

View File

@@ -339,7 +339,7 @@ void FullyConnected::prepareParams() {
             return std::make_shared<DnnlExecutor>(prim_desc);
         }
 
-        // fallback to normal convolution primitive
+        // fallback to normal inner product primitive
         auto inDesc = key.inp0->getDnnlDesc();
         const auto& inDims = inDesc.get_dims(); // @TODO query + copy might be slow
         if (inDims.size() == 3) {
@@ -353,14 +353,15 @@ void FullyConnected::prepareParams() {
             auto normalizedOutDims = { outDims[0] * outDims[1], outDims[2] };
             outDesc = outDesc.reshape(normalizedOutDims);
         }
+        auto wghDescAny = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
+                                             key.inp1->getDataType(), memory::format_tag::any);
         dnnl::inner_product_forward::primitive_desc prim_desc;
         if (key.bias) {
             prim_desc = dnnl::inner_product_forward::primitive_desc(
                 engine,
                 dnnl::prop_kind::forward_inference,
                 inDesc,
-                key.inp1->getDnnlDesc(),
+                wghDescAny,
                 key.bias->getDnnlDesc(),
                 outDesc,
                 key.attr);
@@ -369,17 +370,20 @@ void FullyConnected::prepareParams() {
                 engine,
                 dnnl::prop_kind::forward_inference,
                 inDesc,
-                key.inp1->getDnnlDesc(),
+                wghDescAny,
                 outDesc,
                 key.attr);
         }
 
+        auto first_desc = dnnl::inner_product_forward::primitive_desc(prim_desc.get());
         const bool found = DnnlExtensionUtils::find_implementation(prim_desc, key.implType);
-        if (!found)
-            return nullptr;
+        if (found)
+            return std::make_shared<DnnlExecutor>(prim_desc);
 
-        return std::make_shared<DnnlExecutor>(prim_desc);
+        // For dynamic shapes, the kernel with the expected implementation type may support
+        // the dummy shape but not the runtime inference shape. In that case the requested
+        // implementation type is ignored and the first available primitive descriptor is chosen.
+        return std::make_shared<DnnlExecutor>(first_desc);
     };
 
     auto cache = context->getParamsCache();
@@ -704,7 +708,9 @@ void FullyConnected::initSupportedPrimitiveDescriptors() {
 std::shared_ptr<MemoryDesc> FullyConnected::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const {
     auto desc = idx > 0 ? prim_desc.weights_desc(idx - 1) : prim_desc.src_desc(idx);
-    if (getInputShapeAtPort(idx).getRank() == 3) {
+    if (getInputShapeAtPort(idx).getRank() == 3
+        // report original plain layout for weight since it needs to be reordered dynamically at runtime
+        || idx == 1) {
         return std::make_shared<CpuBlockedMemoryDesc>(
             DnnlExtensionUtils::DataTypeToIEPrecision(desc.get_data_type()), getInputShapeAtPort(idx));
     }
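Two changes cooperate here: the weight descriptor is now created with memory::format_tag::any so oneDNN can pick whatever blocked layout the selected kernel prefers (with getSrcMemDesc() reporting the original plain layout so the reorder happens at runtime), and first_desc snapshots the default implementation so that a preferred implementation type that cannot serve the real runtime shape degrades gracefully instead of returning nullptr. A self-contained sketch of that select-or-fall-back pattern against the plain oneDNN v3 API; the "brgemm" preference string is illustrative, not what the CPU plugin actually requests:

#include <iostream>
#include <string>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);

    // Inner product: src {N, IC}, weights {OC, IC}, dst {N, OC}.
    // format_tag::any lets the library choose the weight layout.
    memory::desc src({64, 512}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc wgh({1024, 512}, memory::data_type::f32, memory::format_tag::any);
    memory::desc dst({64, 1024}, memory::data_type::f32, memory::format_tag::ab);

    auto prim_desc = inner_product_forward::primitive_desc(
        eng, prop_kind::forward_inference, src, wgh, dst);

    // Snapshot of the first (default) implementation: cloning via the C handle
    // keeps it valid while next_impl() advances prim_desc in place, mirroring
    // the first_desc fallback in the diff above.
    auto first_desc = inner_product_forward::primitive_desc(prim_desc.get());

    const std::string preferred = "brgemm";  // illustrative preference only
    bool found = false;
    do {
        if (std::string(prim_desc.impl_info_str()).find(preferred) != std::string::npos) {
            found = true;
            break;
        }
    } while (prim_desc.next_impl());

    const auto& chosen = found ? prim_desc : first_desc;
    std::cout << "using implementation: " << chosen.impl_info_str() << "\n";
    return 0;
}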

View File

@@ -29,7 +29,7 @@ using namespace InferenceEngine;
  */
 namespace SubgraphTestsDefinitions {
-using FQLayerDQBiasParams = std::tuple<InputShape, std::string>;
+using FQLayerDQBiasParams = std::tuple<InputShape, std::string, bool>;
 
 class FQLayerDQBias : virtual public SubgraphBaseTest,
                       public CpuTestWithFusing,
@@ -38,7 +38,8 @@ public:
     static std::string getTestCaseName(testing::TestParamInfo<FQLayerDQBiasParams> obj) {
         InputShape input_shape;
         std::string layer_type;
-        std::tie(input_shape, layer_type) = obj.param;
+        bool extra_multiply;
+        std::tie(input_shape, layer_type, extra_multiply) = obj.param;
 
         std::ostringstream result;
         result << "IS=(" << CommonTestUtils::partialShape2str({input_shape.first}) << ")_TS=(";
@@ -46,6 +47,7 @@ public:
             result << CommonTestUtils::vec2str(item) << "_";
         }
         result << ")_layer_type=" << layer_type;
+        result << "_extra_multiply=" << extra_multiply;
         return result.str();
     }
@@ -53,7 +55,8 @@ protected:
     void SetUp() override {
         InputShape input_shape;
         std::string layer_type;
-        std::tie(input_shape, layer_type) = GetParam();
+        bool extra_multiply;
+        std::tie(input_shape, layer_type, extra_multiply) = GetParam();
 
         targetDevice = CommonTestUtils::DEVICE_CPU;
         std::tie(inFmts, outFmts, priority, selectedType) = CPUSpecificParams{{}, {}, {}, CPUTestsBase::any_type};
@@ -70,7 +73,7 @@ protected:
         const auto shapes = layer_type == "MatMul" ? std::vector<InputShape>{input_shape, input_shape}
                                                    : std::vector<InputShape>{input_shape};
         init_input_shapes(shapes);
-        function = ngraph::builder::subgraph::MarkupBiasFunction::get(ov::element::f32, inputDynamicShapes[0], {}, layer_type);
+        function = ngraph::builder::subgraph::MarkupBiasFunction::get(ov::element::f32, inputDynamicShapes[0], {}, layer_type, extra_multiply);
     }
 
     std::string node_type;
@@ -96,7 +99,8 @@ const std::vector<std::string> layer_types_4D_static = {
 INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQBias_4D_static, FQLayerDQBias,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_4D_static),
-                                            ::testing::ValuesIn(layer_types_4D_static)),
+                                            ::testing::ValuesIn(layer_types_4D_static),
+                                            ::testing::Values(false)),
                          FQLayerDQBias::getTestCaseName);
 
 const std::vector<InputShape> input_shapes_4D_dynamic = {
@@ -111,7 +115,8 @@ const std::vector<std::string> layer_types_4D_dynamic = {
 INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQBias_4D_dynamic, FQLayerDQBias,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_4D_dynamic),
-                                            ::testing::ValuesIn(layer_types_4D_dynamic)),
+                                            ::testing::ValuesIn(layer_types_4D_dynamic),
+                                            ::testing::Values(false)),
                          FQLayerDQBias::getTestCaseName);
 
 const std::vector<InputShape> input_shapes_2D = {
     {{-1, 768}, {{1, 768}}}
@@ -123,7 +128,14 @@ const std::vector<std::string> layer_types_2D = {
 INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQBias_2D, FQLayerDQBias,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_2D),
-                                            ::testing::ValuesIn(layer_types_2D)),
+                                            ::testing::ValuesIn(layer_types_2D),
+                                            ::testing::Values(false)),
                          FQLayerDQBias::getTestCaseName);
 
+INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQExtraMultiplyAdd_2D, FQLayerDQBias,
+                         ::testing::Combine(::testing::ValuesIn(input_shapes_2D),
+                                            ::testing::ValuesIn(layer_types_2D),
+                                            ::testing::Values(true)),
+                         FQLayerDQBias::getTestCaseName);
+
 } // namespace

View File

@@ -18,7 +18,8 @@ public:
     static std::shared_ptr<ov::Model> get(const ov::element::Type& precision,
                                           const ov::PartialShape& input_shape,
                                           const ov::PartialShape& add_shape,
-                                          const std::string& operation_type);
+                                          const std::string& operation_type,
+                                          const bool extra_multipy);
 };
 }  // namespace subgraph
 }  // namespace builder

View File

@@ -13,7 +13,8 @@ namespace subgraph {
 std::shared_ptr<ov::Model> MarkupBiasFunction::get(const ov::element::Type& precision,
                                                    const ov::PartialShape& input_shape,
                                                    const ov::PartialShape& add_shape,
-                                                   const std::string& layer_type) {
+                                                   const std::string& layer_type,
+                                                   const bool extra_multipy) {
     auto input_params = builder::makeDynamicParams(precision, {input_shape});
     auto il = opset1::Constant::create(precision, {}, {0.f});
     auto ih = opset1::Constant::create(precision, {}, {12.5f});
@@ -85,26 +86,41 @@ std::shared_ptr<ov::Model> MarkupBiasFunction::get(const ov::element::Type& prec
     layer->set_friendly_name(layer_type);
 
-    std::shared_ptr<ov::Node> add_input;
-    // empty add_shape means that add_input must be generated automatically
-    if (add_shape.is_static() && add_shape.size() == 0) {
-        const auto& out_shape = layer->get_output_partial_shape(0);
-        Shape bias_shape(out_shape.size(), 1);
-        if (layer_type != "MatMul") {
-            bias_shape[1] = out_shape[1].get_length();
-        }
-        add_input = builder::makeConstant<float>(precision, bias_shape, {}, true);
+    const auto& out_shape = layer->get_output_partial_shape(0);
+    std::shared_ptr<ov::Node> add_input0 = layer;
+    if (extra_multipy) {
+        Shape mul_shape{};
+        if (out_shape.is_static()) {
+            mul_shape.resize(out_shape.size(), 1);
+            if (layer_type != "MatMul")
+                mul_shape[1] = out_shape[1].get_length();
+            else
+                mul_shape[out_shape.size() - 1] = out_shape[out_shape.size() - 1].get_length();
+        } else {
+            mul_shape = Shape{1};
+        }
+        auto mul_input = builder::makeConstant<float>(precision, mul_shape, {}, true);
+        add_input0 = std::make_shared<ov::opset1::Multiply>(layer, mul_input);
+    }
+
+    std::shared_ptr<ov::Node> add_input1;
+    // empty add_shape means that add_input1 must be generated automatically
+    if (add_shape.is_static() && add_shape.size() == 0) {
+        Shape bias_shape(out_shape.size(), 1);
+        if (layer_type != "MatMul")
+            bias_shape[1] = out_shape[1].get_length();
+        add_input1 = builder::makeConstant<float>(precision, bias_shape, {}, true);
     } else {
         if (add_shape.is_static()) {
-            add_input = builder::makeConstant<float>(precision, add_shape.to_shape(), {}, true);
+            add_input1 = builder::makeConstant<float>(precision, add_shape.to_shape(), {}, true);
         } else {
             auto new_param = std::make_shared<ov::opset1::Parameter>(precision, input_shape);
             input_params.push_back(new_param);
-            add_input = new_param;
+            add_input1 = new_param;
         }
     }
-    auto add = std::make_shared<ov::opset1::Add>(layer, add_input);
+    auto add = std::make_shared<ov::opset1::Add>(add_input0, add_input1);
     return std::make_shared<ov::Model>(add, input_params);
 }
 } // namespace subgraph
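For reference, the subgraph produced by the new extra_multipy branch hangs a constant Multiply between the quantized layer and the bias Add. A hedged sketch of the resulting pattern using the public OpenVINO opset (shapes and constant values are illustrative, and the FakeQuantize front is omitted for brevity):

#include <memory>
#include "openvino/openvino.hpp"
#include "openvino/opsets/opset1.hpp"

// Illustrative pattern: layer -> Multiply(const) -> Add(bias). Because both
// the LPT dequantize Multiply and this extra Multiply are purely
// multiplicative, fuseDQScales() can fold them into one set of DQ scales
// ahead of the Add.
std::shared_ptr<ov::Model> make_extra_multiply_add_pattern() {
    using namespace ov::opset1;
    auto input = std::make_shared<Parameter>(ov::element::f32, ov::Shape{1, 768});
    auto weights = Constant::create(ov::element::f32, {768, 768}, {0.1f});
    auto matmul = std::make_shared<MatMul>(input, weights);
    // per-output-channel scale, as built for the static-shape MatMul case above
    auto mul_const = Constant::create(ov::element::f32, {1, 768}, {2.0f});
    auto mul = std::make_shared<Multiply>(matmul, mul_const);
    auto bias = Constant::create(ov::element::f32, {1, 768}, {0.5f});
    auto add = std::make_shared<Add>(mul, bias);
    return std::make_shared<ov::Model>(add, ov::ParameterVector{input});
}

With extra_multipy set to false, the Multiply is skipped and the function reduces to the original FQ -> layer -> Add bias pattern.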