[CPU] Fixed Divide operation support for I32 precision (#3721)

Author: Gorokhov Dmitriy, 2020-12-28 19:18:19 +03:00 (committed by GitHub)
parent a8daab3377
commit 37b6e75730
7 changed files with 66 additions and 7 deletions

View File

@@ -204,14 +204,43 @@ void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
     Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
     Vmm vmm_dst = Vmm(out_vec_idxs[0]);

+    auto uni_vdiv = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
+        switch (exec_prc_) {
+            case Precision::FP32: {
+                h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1);
+                break;
+            }
+            case Precision::I32: {
+                Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
+
+                // The instruction set doesn't provide a vector instruction for integer division.
+                // As a workaround, we emulate its behavior via FP division followed by rounding toward zero.
+                h->uni_vcvtdq2ps(vmm_dst, vmm_src0);
+                h->uni_vcvtdq2ps(vmm_aux0, vmm_src1);
+                h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0);
+                h->uni_vroundps(vmm_dst, vmm_dst, 3); // round toward zero
+                h->uni_vcvtps2dq(vmm_dst, vmm_dst);
+                break;
+            }
+            default: assert(!"unsupported precision");
+        }
+    };
+
     if (isa == cpu::sse42) {
         h->uni_vmovups(vmm_dst, vmm_src0);
-        h->uni_vdivps(vmm_dst, vmm_dst, vmm_src1);
+        uni_vdiv(vmm_dst, vmm_dst, vmm_src1);
     } else {
-        h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1);
+        uni_vdiv(vmm_dst, vmm_src0, vmm_src1);
     }
 }

+std::set<InferenceEngine::Precision> jit_divide_emitter::get_supported_precisions() {
+    return {Precision::FP32, Precision::I32};
+}
+
+size_t jit_divide_emitter::aux_vecs_count() const {
+    return exec_prc_ == Precision::I32 ? 1 : 0;
+}
+
 /// FLOOR_MOD ///
 jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
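For context on the hunk above: SSE/AVX offer no packed integer division instruction, so the emitter lowers I32 Divide to an FP32 divide followed by uni_vroundps with immediate 3 (round toward zero) and a conversion back. The following standalone C++ sketch, not part of the commit and using a hypothetical helper name emulated_div, models the same sequence per lane:

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    // Scalar model of the emitted sequence:
    // uni_vcvtdq2ps -> uni_vdivps -> uni_vroundps(imm = 3) -> uni_vcvtps2dq
    int32_t emulated_div(int32_t a, int32_t b) {
        float fa = static_cast<float>(a);  // int -> float, as uni_vcvtdq2ps does per lane
        float fb = static_cast<float>(b);
        float q = fa / fb;                 // FP32 divide, as uni_vdivps
        q = std::trunc(q);                 // round toward zero, as uni_vroundps with imm 3
        return static_cast<int32_t>(q);    // float -> int, as uni_vcvtps2dq
    }

    int main() {
        // Truncation toward zero matches C++ integer division for both signs.
        assert(emulated_div(7, 2) == 7 / 2);    //  3
        assert(emulated_div(-7, 2) == -7 / 2);  // -3
        return 0;
    }

Rounding toward zero (rather than floor) is what makes negative quotients agree with integer division semantics; the caveat is that operands above 2^24 can lose precision in the int-to-float conversion, which is also why fusing is restricted below.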

View File

@@ -81,6 +81,7 @@ public:
                         InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);

     size_t get_inputs_num() override;
+    static std::set<InferenceEngine::Precision> get_supported_precisions();

 private:
     void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
@@ -88,6 +89,7 @@ private:
     template <mkldnn::impl::cpu::cpu_isa_t isa>
     void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;

+    size_t aux_vecs_count() const override;
 };

View File

@@ -70,9 +70,12 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
     for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
         if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
             std::set<Precision> prcs = get_supported_precisions(*eltwiseNode.getFusedWith()[i].get());
+            std::set<Precision> prcs_intersect = {};

             std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(),
-                                  prcs.begin(), prcs.end(), std::inserter(supported_precision_intersection, supported_precision_intersection.begin()));
+                                  prcs.begin(), prcs.end(), std::inserter(prcs_intersect, prcs_intersect.begin()));
+
+            supported_precision_intersection = prcs_intersect;
         }
     }
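A note on the std::set_intersection change: the algorithm requires the output range not to overlap either input range, so writing through an inserter into supported_precision_intersection while it is also the first input gave incorrect results. The fixed code intersects into a temporary and then assigns back. A minimal standalone sketch of the corrected pattern, with plain ints standing in for Precision values:

    #include <algorithm>
    #include <iterator>
    #include <set>

    int main() {
        std::set<int> supported = {1, 2, 3};  // stands in for supported_precision_intersection
        std::set<int> prcs      = {2, 3, 4};  // precisions supported by the fused node

        // Intersect into a fresh set; the output range must not overlap the inputs.
        std::set<int> prcs_intersect;
        std::set_intersection(supported.begin(), supported.end(),
                              prcs.begin(), prcs.end(),
                              std::inserter(prcs_intersect, prcs_intersect.begin()));

        supported = prcs_intersect;  // now {2, 3}
        return 0;
    }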
@@ -1723,9 +1726,29 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
         return false;
     };

+    auto isSuitableNode = [](const MKLDNNEltwiseNode* node) {
+        // [WA] Since changing the execution precision from I32 to FP32 for the Divide operation may lead to incorrect results,
+        // we disable its fusing: otherwise there is no guarantee it will be executed in I32.
+        // [TODO] Precision support needs to be reworked to avoid implicit conversions to FP32
+        // (everything should be handled via explicit Convert operations).
+        if (node->getOpType() == Divide) {
+            for (int i = 0; i < node->getCnnLayer()->insData.size(); i++) {
+                if (node->getCnnLayer()->insData[i].lock()->getPrecision() == Precision::I32) {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    };
+
     if (!mayiuse(cpu::sse42))
         return false;

+    if (!isSuitableNode(this)) {
+        return false;
+    }
+
     // FQ inputs with quantization parameters will be hidden inside the post_op object, so they will not increase the number of inputs
     size_t addedInputEdgesNum = node->getType() != Quantize ? (node->getParentEdges().size() - 1) : 0;
     if (getParentEdges().size() + addedInputEdgesNum > MAX_ELTWISE_INPUTS)
if (getParentEdges().size() + addedInputEdgesNum > MAX_ELTWISE_INPUTS)
@@ -1734,6 +1757,10 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
     if (node->getType() == Eltwise) {
         auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
         if (eltwiseNode->getParentEdgesAtPort(0)[0]->getParent().get() != this) {
+            if (!isSuitableNode(this)) {
+                return false;
+            }
+
             // Eltwise jitter doesn't respect the commutative property, so fusing is disabled when applied to a port other than the 0-th one.
             if (isOneOf(eltwiseNode->getOpType(), {Subtract, Divide, FloorMod, Mod, PowerDynamic, Greater, GreaterEqual, Less, LessEqual}))
                 return false;
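The workaround in the two hunks above exists because a fused Divide may be executed in FP32, and FP32 has only a 24-bit significand: I32 values above 2^24 are not exactly representable, so routing them through float can silently change the result. A standalone illustration, not part of the commit:

    #include <cassert>
    #include <cstdint>

    int main() {
        int32_t big = (1 << 24) + 1;        // 16777217, the first integer FP32 cannot represent
        float f = static_cast<float>(big);  // rounds to 16777216.0f
        assert(static_cast<int32_t>(f) != big);

        // big / 1 computed through FP32 yields 16777216 instead of 16777217,
        // which is why Divide fusing is disabled when any input is I32.
        assert(static_cast<int32_t>(static_cast<float>(big) / 1.0f) == big - 1);
        return 0;
    }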

View File

@@ -28,6 +28,7 @@ std::vector<std::vector<std::vector<size_t>>> inShapes = {
 std::vector<InferenceEngine::Precision> netPrecisions = {
     InferenceEngine::Precision::FP32,
     InferenceEngine::Precision::FP16,
+    InferenceEngine::Precision::I32,
 };

 std::vector<ngraph::helpers::InputLayerType> secondaryInputTypes = {

View File

@@ -136,7 +136,7 @@ std::vector<std::vector<InferenceEngine::Precision>> inputPrecisions = {
 std::vector<std::vector<EltwiseTypes>> eltwiseOps = {
     { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT },
-    { EltwiseTypes::DIVIDE, EltwiseTypes::POWER, EltwiseTypes::ADD },
+    { EltwiseTypes::DIVIDE, EltwiseTypes::SQUARED_DIFF, EltwiseTypes::ADD },
 };

 INSTANTIATE_TEST_CASE_P(smoke_EltwiseChain, EltwiseChainTest,

View File

@@ -168,7 +168,8 @@ protected:
             }

             const auto max = std::max(CommonTestUtils::ie_abs(res), CommonTestUtils::ie_abs(ref));
-            ASSERT_TRUE(max != 0 && ((absoluteDifference / max) <= threshold))
+            float diff = static_cast<float>(absoluteDifference) / static_cast<float>(max);
+            ASSERT_TRUE(max != 0 && (diff <= static_cast<float>(threshold)))
                 << "Relative comparison of values expected: " << ref << " and actual: " << res
                 << " at index " << i << " with threshold " << threshold
                 << " failed";

View File

@@ -635,8 +635,7 @@ tests_expected_to_fail = [
                 "OnnxBackendNodeModelTest.test_adagrad_multiple_cpu",
                 "OnnxBackendNodeModelTest.test_adagrad_cpu"),
     (xfail_issue_41894,
-     "OnnxBackendNodeModelTest.test_max_uint16_cpu",
-     "OnnxBackendNodeModelTest.test_mod_int64_fmod_cpu"),
+     "OnnxBackendNodeModelTest.test_max_uint16_cpu"),
     (xfail_issue_43523,
         "OnnxBackendNodeModelTest.test_reduce_sum_do_not_keepdims_example_cpu",
         "OnnxBackendNodeModelTest.test_reduce_sum_do_not_keepdims_random_cpu",