[CPU] Combine DQ scales and multiply into DQ scales. (#17276)
* Fix the multiple multiply case.
* Add test case.
* Fix CI issues.
* Fix the dynamic shape FC kernel creation issue.
* Expose FC weight with its original linear layout to the CPU graph.
* Apply review comments.
This commit is contained in:
parent 2e9e6d2dd4
commit 0b3a004825
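
The core identity behind this change: a dequantizing Multiply followed by another Multiply by a constant collapses into a single set of dequantization scales, since (x * s) * c == x * (s * c). A minimal numeric sketch (values are illustrative, not from the commit):

#include <cstdint>
#include <cstdio>

int main() {
    int8_t q = 100;            // quantized activation
    float dq_scale = 0.02f;    // scale of the dequantizing Multiply
    float extra = 1.5f;        // constant of a following Multiply
    float separate = (q * dq_scale) * extra;  // two multiplies at runtime
    float fused = q * (dq_scale * extra);     // one combined DQ scale
    std::printf("%f %f\n", separate, fused);  // both print 3.000000
    return 0;
}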
@@ -49,7 +49,8 @@ protected:
         actualFunction = ngraph::builder::subgraph::MarkupBiasFunction::get(precision,
                                                                             test_values.input_shape,
                                                                             test_values.bias_shape,
-                                                                            layer_type);
+                                                                            layer_type,
+                                                                            false);
         SimpleLowPrecisionTransformer transformer;
         transformer.transform(actualFunction);
     }
@@ -248,7 +248,7 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) {
         auto scalesDims = getNormalizedDimsBySize(scales->getOutputShapeAtPort(0).getDims(),
                                                   node->getOutputShapeAtPort(0).getDims().size());
         auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), 1, std::multiplies<size_t>());
-        node->initializeDQScales(scalesData, scaleSize);
+        node->fuseDQScales(scalesData, scaleSize);
         return true;
     };
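
For context on scaleSize above: the scale tensor's dims are first normalized to the node's output rank (getNormalizedDimsBySize is an OpenVINO-internal helper), then flattened to an element count. A small sketch of the flattening, with made-up per-channel dims:

#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
    std::vector<size_t> scalesDims{1, 64, 1, 1};  // e.g. per-channel scales after rank normalization
    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(),
                                     static_cast<size_t>(1), std::multiplies<size_t>());
    std::printf("%zu\n", scaleSize);  // prints 64
    return 0;
}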
@@ -1650,18 +1650,23 @@ void Node::addSupportedPrimDesc(const std::vector<PortConfigurator>& inPortConfi
     supportedPrimitiveDescriptors.emplace_back(config, implType);
 }
 
-void Node::initializeDQScales(const float* scaleData, const size_t scaleSize) {
-    if (!DQScales.empty() || !scaleSize)
-        IE_THROW() << "DQ scales is preset or scale size is 0, ##" << getName();
-    DQScales.reserve(scaleSize);
-
-    bool scalePerTensor = true;
-    for (size_t i = 0; i < scaleSize; i++) {
-        DQScales.push_back(scaleData[i]);
-        if (scaleData[i] != scaleData[0])
-            scalePerTensor = false;
-    }
-    if (scalePerTensor)
-        DQScales.resize(1);
+void Node::fuseDQScales(const float* scaleData, const size_t scaleSize) {
+    if (DQScales.empty())
+        DQScales.resize(scaleSize, 1.0);
+    IE_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize)
+        << "set invalid scales size , DQScales vector size: " << DQScales.size()
+        << ", scale data size: " << scaleSize
+        << "Node: ##" << getName();
+    if (scaleSize > DQScales.size())
+        DQScales.resize(scaleSize, DQScales[0]);
+    if (1 == scaleSize) {
+        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val){ return (scaleData[0] * val); });
+    } else {
+        for (size_t i = 0; i < DQScales.size(); i++) {
+            DQScales[i] *= scaleData[i];
+        }
+    }
+    if (std::all_of(DQScales.begin(), DQScales.end(), [=](float val){ return (val == DQScales[0]);}))
+        DQScales.resize(1);
 }
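
A standalone sketch of the merge semantics fuseDQScales implements (function and variable names here are mine; the real method lives on Node and reports errors via IE_ASSERT): an empty vector starts as identity, per-tensor and per-channel scales broadcast against each other, and the result collapses back to one element when all channels agree.

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

static void fuse_scales(std::vector<float>& dq, const float* data, size_t n) {
    if (dq.empty())
        dq.resize(n, 1.0f);               // first call: start from identity
    assert(n == 1 || dq.size() == 1 || dq.size() == n);
    if (n > dq.size())
        dq.resize(n, dq[0]);              // broadcast per-tensor to per-channel
    for (size_t i = 0; i < dq.size(); i++)
        dq[i] *= (n == 1 ? data[0] : data[i]);
    if (std::all_of(dq.begin(), dq.end(), [&](float v) { return v == dq[0]; }))
        dq.resize(1);                     // collapse when effectively per-tensor
}

int main() {
    std::vector<float> dq;                    // scales owned by the node
    const float dq_scales[] = {0.5f, 0.25f};  // from the DQ Multiply
    const float extra[] = {2.0f};             // from the following Multiply constant
    fuse_scales(dq, dq_scales, 2);
    fuse_scales(dq, extra, 1);
    for (float v : dq)
        std::printf("%g ", v);                // prints: 1 0.5
    return 0;
}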
@@ -535,7 +535,7 @@ public:
      */
     std::pair<std::vector<float>, std::vector<float>> getScalesAndShifts(const Node *parentNode) const;
 
-    void initializeDQScales(const float* scaleData, const size_t scaleSize);
+    void fuseDQScales(const float* scaleData, const size_t scaleSize);
     const std::vector<float>& getDQScales() const {
         return DQScales;
     }
@@ -339,7 +339,7 @@ void FullyConnected::prepareParams() {
             return std::make_shared<DnnlExecutor>(prim_desc);
         }
 
-        // fallback to normal convolution primitive
+        // fallback to normal inner product primitive
         auto inDesc = key.inp0->getDnnlDesc();
         const auto& inDims = inDesc.get_dims(); // @TODO query + copy might be slow
         if (inDims.size() == 3) {
@@ -353,14 +353,15 @@ void FullyConnected::prepareParams() {
             auto normalizedOutDims = { outDims[0] * outDims[1], outDims[2] };
             outDesc = outDesc.reshape(normalizedOutDims);
         }
 
+        auto wghDescAny = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
+                                             key.inp1->getDataType(), memory::format_tag::any);
         dnnl::inner_product_forward::primitive_desc prim_desc;
         if (key.bias) {
             prim_desc = dnnl::inner_product_forward::primitive_desc(
                 engine,
                 dnnl::prop_kind::forward_inference,
                 inDesc,
-                key.inp1->getDnnlDesc(),
+                wghDescAny,
                 key.bias->getDnnlDesc(),
                 outDesc,
                 key.attr);
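
The new wghDescAny descriptor uses memory::format_tag::any, which defers the weight-layout decision to oneDNN when the primitive descriptor is created, instead of pinning it to the layout of the cached input memory. The same idiom in a self-contained oneDNN sketch (shapes are illustrative):

#include <dnnl.hpp>

// Let the library choose the weight layout for an inner product.
dnnl::inner_product_forward::primitive_desc make_ip(const dnnl::engine& eng) {
    using tag = dnnl::memory::format_tag;
    using dt = dnnl::memory::data_type;
    dnnl::memory::desc src({16, 768}, dt::f32, tag::ab);
    dnnl::memory::desc wei({3072, 768}, dt::f32, tag::any);  // library picks the layout
    dnnl::memory::desc dst({16, 3072}, dt::f32, tag::ab);
    return dnnl::inner_product_forward::primitive_desc(
        eng, dnnl::prop_kind::forward_inference, src, wei, dst);
}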
@@ -369,17 +370,20 @@ void FullyConnected::prepareParams() {
                 engine,
                 dnnl::prop_kind::forward_inference,
                 inDesc,
-                key.inp1->getDnnlDesc(),
+                wghDescAny,
                 outDesc,
                 key.attr);
         }
 
+        auto first_desc = dnnl::inner_product_forward::primitive_desc(prim_desc.get());
         const bool found = DnnlExtensionUtils::find_implementation(prim_desc, key.implType);
 
-        if (!found)
-            return nullptr;
-
-        return std::make_shared<DnnlExecutor>(prim_desc);;
+        if (found)
+            return std::make_shared<DnnlExecutor>(prim_desc);
+        // For dynamic shape, the expected implement type kernel can support with dummy shape but
+        // not the run time inference shape. In this case, the implementation type will be
+        // ignored and the first available primitive descriptor will be chosen
+        return std::make_shared<DnnlExecutor>(first_desc);
     };
 
     auto cache = context->getParamsCache();
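
The fix above: previously a failed find_implementation returned nullptr (note the dead ";;" return after it), which broke executor creation for dynamic shapes where the expected implementation only matches the dummy shape. Now the requested implementation is preferred, and first_desc, a copy taken before the search advances prim_desc, serves as the fallback. Simplified, the search the helper performs amounts to the following sketch (matches() stands in for the plugin's impl-type comparison; this is not the plugin's code):

// Walk the implementation list of pd until a match; pd stays positioned on it.
bool find_impl(dnnl::primitive_desc_base& pd, bool (*matches)(const char*)) {
    do {
        if (matches(pd.impl_info_str()))
            return true;
    } while (pd.next_impl());
    return false;  // caller falls back to the first-listed descriptor
}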
@@ -704,7 +708,9 @@ void FullyConnected::initSupportedPrimitiveDescriptors() {
 std::shared_ptr<MemoryDesc> FullyConnected::getSrcMemDesc(const dnnl::primitive_desc &prim_desc, size_t idx) const {
     auto desc = idx > 0 ? prim_desc.weights_desc(idx - 1) : prim_desc.src_desc(idx);
 
-    if (getInputShapeAtPort(idx).getRank() == 3) {
+    if (getInputShapeAtPort(idx).getRank() == 3
+        // report original plain layout for weight since it needs to be reordered dynamically at runtime
+        || idx == 1) {
         return std::make_shared<CpuBlockedMemoryDesc>(
             DnnlExtensionUtils::DataTypeToIEPrecision(desc.get_data_type()), getInputShapeAtPort(idx));
     }
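
Why the weight port (idx == 1) now reports its original plain layout: the node keeps the weights linear in graph memory and reorders them into whatever layout the selected primitive requested (via format_tag::any above) when the executor is built. The reorder itself is the standard oneDNN idiom; a generic sketch, not the plugin's code:

#include <dnnl.hpp>

// Reorder plain-layout weights into the layout the primitive selected.
dnnl::memory reorder_weights(const dnnl::engine& eng, dnnl::stream& strm,
                             dnnl::memory plain_wei,
                             const dnnl::memory::desc& wanted) {
    if (plain_wei.get_desc() == wanted)
        return plain_wei;                  // already in the right layout
    dnnl::memory blocked_wei(wanted, eng); // allocate the target buffer
    dnnl::reorder(plain_wei, blocked_wei).execute(strm, plain_wei, blocked_wei);
    strm.wait();
    return blocked_wei;
}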
@@ -29,7 +29,7 @@ using namespace InferenceEngine;
 */
 
 namespace SubgraphTestsDefinitions {
-using FQLayerDQBiasParams = std::tuple<InputShape, std::string>;
+using FQLayerDQBiasParams = std::tuple<InputShape, std::string, bool>;
 
 class FQLayerDQBias : virtual public SubgraphBaseTest,
                       public CpuTestWithFusing,
@@ -38,7 +38,8 @@ public:
     static std::string getTestCaseName(testing::TestParamInfo<FQLayerDQBiasParams> obj) {
         InputShape input_shape;
         std::string layer_type;
-        std::tie(input_shape, layer_type) = obj.param;
+        bool extra_multiply;
+        std::tie(input_shape, layer_type, extra_multiply) = obj.param;
 
         std::ostringstream result;
         result << "IS=(" << CommonTestUtils::partialShape2str({input_shape.first}) << ")_TS=(";
@@ -46,6 +47,7 @@ public:
             result << CommonTestUtils::vec2str(item) << "_";
         }
         result << ")_layer_type=" << layer_type;
+        result << ")_extra_multiply=" << extra_multiply;
         return result.str();
     }
@@ -53,7 +55,8 @@ protected:
     void SetUp() override {
         InputShape input_shape;
         std::string layer_type;
-        std::tie(input_shape, layer_type) = GetParam();
+        bool extra_multiply;
+        std::tie(input_shape, layer_type, extra_multiply) = GetParam();
 
         targetDevice = CommonTestUtils::DEVICE_CPU;
         std::tie(inFmts, outFmts, priority, selectedType) = CPUSpecificParams{{}, {}, {}, CPUTestsBase::any_type};
@@ -70,7 +73,7 @@ protected:
         const auto shapes = layer_type == "MatMul" ? std::vector<InputShape>{input_shape, input_shape}
                                                    : std::vector<InputShape>{input_shape};
         init_input_shapes(shapes);
-        function = ngraph::builder::subgraph::MarkupBiasFunction::get(ov::element::f32, inputDynamicShapes[0], {}, layer_type);
+        function = ngraph::builder::subgraph::MarkupBiasFunction::get(ov::element::f32, inputDynamicShapes[0], {}, layer_type, extra_multiply);
     }
 
     std::string node_type;
@@ -96,7 +99,8 @@ const std::vector<std::string> layer_types_4D_static = {
 
 INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQBias_4D_static, FQLayerDQBias,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_4D_static),
-                                            ::testing::ValuesIn(layer_types_4D_static)),
+                                            ::testing::ValuesIn(layer_types_4D_static),
+                                            ::testing::Values(false)),
                          FQLayerDQBias::getTestCaseName);
 
 const std::vector<InputShape> input_shapes_4D_dynamic = {
@@ -111,7 +115,8 @@ const std::vector<std::string> layer_types_4D_dynamic = {
 
 INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQBias_4D_dynamic, FQLayerDQBias,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_4D_dynamic),
-                                            ::testing::ValuesIn(layer_types_4D_dynamic)),
+                                            ::testing::ValuesIn(layer_types_4D_dynamic),
+                                            ::testing::Values(false)),
                          FQLayerDQBias::getTestCaseName);
 
 const std::vector<InputShape> input_shapes_2D = {
     {{-1, 768}, {{1, 768}}}
@@ -123,7 +128,14 @@ const std::vector<std::string> layer_types_2D = {
 
 INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQBias_2D, FQLayerDQBias,
                          ::testing::Combine(::testing::ValuesIn(input_shapes_2D),
-                                            ::testing::ValuesIn(layer_types_2D)),
+                                            ::testing::ValuesIn(layer_types_2D),
+                                            ::testing::Values(false)),
                          FQLayerDQBias::getTestCaseName);
 
+INSTANTIATE_TEST_SUITE_P(smoke_FQLayerDQExtraMultiplyAdd_2D, FQLayerDQBias,
+                         ::testing::Combine(::testing::ValuesIn(input_shapes_2D),
+                                            ::testing::ValuesIn(layer_types_2D),
+                                            ::testing::Values(true)),
+                         FQLayerDQBias::getTestCaseName);
+
 } // namespace
@@ -18,7 +18,8 @@ public:
     static std::shared_ptr<ov::Model> get(const ov::element::Type& precision,
                                           const ov::PartialShape& input_shape,
                                           const ov::PartialShape& add_shape,
-                                          const std::string& operation_type);
+                                          const std::string& operation_type,
+                                          const bool extra_multipy);
 };
 }  // namespace subgraph
 }  // namespace builder
@@ -13,7 +13,8 @@ namespace subgraph {
 std::shared_ptr<ov::Model> MarkupBiasFunction::get(const ov::element::Type& precision,
                                                    const ov::PartialShape& input_shape,
                                                    const ov::PartialShape& add_shape,
-                                                   const std::string& layer_type) {
+                                                   const std::string& layer_type,
+                                                   const bool extra_multipy) {
     auto input_params = builder::makeDynamicParams(precision, {input_shape});
     auto il = opset1::Constant::create(precision, {}, {0.f});
     auto ih = opset1::Constant::create(precision, {}, {12.5f});
@@ -85,26 +86,41 @@ std::shared_ptr<ov::Model> MarkupBiasFunction::get(const ov::element::Type& prec
 
     layer->set_friendly_name(layer_type);
 
-    std::shared_ptr<ov::Node> add_input;
-    // empty add_shape means that add_input must be generated automatically
-    if (add_shape.is_static() && add_shape.size() == 0) {
-        const auto& out_shape = layer->get_output_partial_shape(0);
-        Shape bias_shape(out_shape.size(), 1);
-        if (layer_type != "MatMul") {
-            bias_shape[1] = out_shape[1].get_length();
-        }
-        add_input = builder::makeConstant<float>(precision, bias_shape, {}, true);
+    const auto& out_shape = layer->get_output_partial_shape(0);
+
+    std::shared_ptr<ov::Node> add_input0 = layer;
+    if (extra_multipy) {
+        Shape mul_shape{};
+        if (out_shape.is_static()) {
+            mul_shape.resize(out_shape.size(), 1);
+            if (layer_type != "MatMul")
+                mul_shape[1] = out_shape[1].get_length();
+            else
+                mul_shape[out_shape.size() - 1] = out_shape[out_shape.size() - 1].get_length();
+        } else {
+            mul_shape = Shape{1};
+        }
+        std::shared_ptr<ov::Node> mul;
+        auto mul_input = builder::makeConstant<float>(precision, mul_shape, {}, true);
+        add_input0 = std::make_shared<ov::opset1::Multiply>(layer, mul_input);
+    }
+    std::shared_ptr<ov::Node> add_input1;
+    // empty add_shape means that add_input1 must be generated automatically
+    if (add_shape.is_static() && add_shape.size() == 0) {
+        Shape bias_shape(out_shape.size(), 1);
+        if (layer_type != "MatMul")
+            bias_shape[1] = out_shape[1].get_length();
+        add_input1 = builder::makeConstant<float>(precision, bias_shape, {}, true);
     } else {
         if (add_shape.is_static()) {
-            add_input = builder::makeConstant<float>(precision, add_shape.to_shape(), {}, true);
+            add_input1 = builder::makeConstant<float>(precision, add_shape.to_shape(), {}, true);
        } else {
            auto new_param = std::make_shared<ov::opset1::Parameter>(precision, input_shape);
            input_params.push_back(new_param);
-            add_input = new_param;
+            add_input1 = new_param;
        }
    }
 
-    auto add = std::make_shared<ov::opset1::Add>(layer, add_input);
+    auto add = std::make_shared<ov::opset1::Add>(add_input0, add_input1);
     return std::make_shared<ov::Model>(add, input_params);
 }
 }  // namespace subgraph
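
With extra_multipy enabled, the builder above produces FakeQuantize -> layer -> Multiply(const) -> Add(const), i.e. a dequantize followed by one more constant multiply before the bias Add, which is exactly the pattern the graph optimizer now folds into the DQ scales. A condensed topology sketch in plain opset1 calls (shapes and constants are illustrative; the real builder randomizes them):

#include <memory>
#include <openvino/opsets/opset1.hpp>

std::shared_ptr<ov::Model> make_fq_mul_add() {
    using namespace ov;
    auto in = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 768});
    auto lo = opset1::Constant::create(element::f32, {}, {0.f});
    auto hi = opset1::Constant::create(element::f32, {}, {12.5f});
    auto fq = std::make_shared<opset1::FakeQuantize>(in, lo, hi, lo, hi, 256);
    auto wei = opset1::Constant::create(element::f32, Shape{768, 128}, {0.01f});
    auto mm = std::make_shared<opset1::MatMul>(fq, wei);
    auto mul_c = opset1::Constant::create(element::f32, Shape{1, 128}, {2.f});
    auto mul = std::make_shared<opset1::Multiply>(mm, mul_c);  // the extra multiply
    auto add_c = opset1::Constant::create(element::f32, Shape{1, 128}, {0.5f});
    auto add = std::make_shared<opset1::Add>(mul, add_c);      // the bias
    return std::make_shared<Model>(OutputVector{add}, ParameterVector{in});
}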