[CPU] Fixed Divide operation support for I32 precision (#3721)
* [CPU] Fixed Divide operation support for I32 precision
This commit is contained in:
parent
a8daab3377
commit
37b6e75730
@ -204,14 +204,43 @@ void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
|
||||
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
|
||||
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
|
||||
|
||||
auto uni_vdiv = [this](Vmm vmm_dst, Vmm vmm_src0, Vmm vmm_src1) {
|
||||
switch (exec_prc_) {
|
||||
case Precision::FP32: {
|
||||
h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1);
|
||||
break;
|
||||
}
|
||||
case Precision::I32: {
|
||||
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
|
||||
|
||||
// The opset doesn't contain vector instruction for integer divide operation
|
||||
// As WA we emulate its behavior via fp divide followed by rounding to zero
|
||||
h->uni_vcvtdq2ps(vmm_dst, vmm_src0);
|
||||
h->uni_vcvtdq2ps(vmm_aux0, vmm_src1);
|
||||
h->uni_vdivps(vmm_dst, vmm_dst, vmm_aux0);
|
||||
h->uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero
|
||||
h->uni_vcvtps2dq(vmm_dst, vmm_dst);
|
||||
break;
|
||||
}
|
||||
default: assert(!"unsupported precision");
|
||||
}
|
||||
};
|
||||
|
||||
if (isa == cpu::sse42) {
|
||||
h->uni_vmovups(vmm_dst, vmm_src0);
|
||||
h->uni_vdivps(vmm_dst, vmm_dst, vmm_src1);
|
||||
uni_vdiv(vmm_dst, vmm_dst, vmm_src1);
|
||||
} else {
|
||||
h->uni_vdivps(vmm_dst, vmm_src0, vmm_src1);
|
||||
uni_vdiv(vmm_dst, vmm_src0, vmm_src1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the set of precisions reported as supported by the divide emitter: FP32 and I32.
std::set<InferenceEngine::Precision> jit_divide_emitter::get_supported_precisions() {
|
||||
return {Precision::FP32, Precision::I32};
|
||||
}
|
||||
|
||||
/// Returns the number of auxiliary vector registers requested by the divide emitter:
/// one when exec_prc_ is I32 (the I32 branch of emit_isa reads aux_vec_idxs[0]), zero otherwise.
size_t jit_divide_emitter::aux_vecs_count() const {
|
||||
return exec_prc_ == Precision::I32 ? 1 : 0;
|
||||
}
|
||||
|
||||
/// FLOOR_MOD ///
|
||||
jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
|
||||
|
@ -81,6 +81,7 @@ public:
|
||||
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
|
||||
|
||||
size_t get_inputs_num() override;
|
||||
static std::set<InferenceEngine::Precision> get_supported_precisions();
|
||||
|
||||
private:
|
||||
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
|
||||
@ -88,6 +89,7 @@ private:
|
||||
|
||||
template <mkldnn::impl::cpu::cpu_isa_t isa>
|
||||
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
|
||||
size_t aux_vecs_count() const override;
|
||||
};
|
||||
|
||||
|
||||
|
@ -70,9 +70,12 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
|
||||
for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) {
|
||||
if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) {
|
||||
std::set<Precision> prcs = get_supported_precisions(*eltwiseNode.getFusedWith()[i].get());
|
||||
std::set<Precision> prcs_intersect = {};
|
||||
|
||||
std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(),
|
||||
prcs.begin(), prcs.end(), std::inserter(supported_precision_intersection, supported_precision_intersection.begin()));
|
||||
prcs.begin(), prcs.end(), std::inserter(prcs_intersect, prcs_intersect.begin()));
|
||||
|
||||
supported_precision_intersection = prcs_intersect;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1723,9 +1726,29 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
|
||||
return false;
|
||||
};
|
||||
|
||||
auto isSuitableNode = [](const MKLDNNEltwiseNode* node) {
|
||||
// [WA] Since execution precision change from I32 to FP32 for Divide operation may lead to incorrect results
|
||||
// we disable its fusing; otherwise there is no guarantee it will be executed in I32
|
||||
// [TODO] We need to rewrite support for different precisions entirely to avoid implicit conversions to FP32
|
||||
// (all should be handled via explicit convert operations)
|
||||
if (node->getOpType() == Divide) {
|
||||
for (int i = 0; i < node->getCnnLayer()->insData.size(); i++) {
|
||||
if (node->getCnnLayer()->insData[i].lock()->getPrecision() == Precision::I32) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
if (!mayiuse(cpu::sse42))
|
||||
return false;
|
||||
|
||||
if (!isSuitableNode(this)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// FQ inputs with quantization parameters will be hidden inside the post_op object, so they will not increase the number of inputs
|
||||
size_t addedInputEdgesNum = node->getType() != Quantize ? (node->getParentEdges().size() - 1) : 0;
|
||||
if (getParentEdges().size() + addedInputEdgesNum > MAX_ELTWISE_INPUTS)
|
||||
@ -1734,6 +1757,10 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
|
||||
if (node->getType() == Eltwise) {
|
||||
auto eltwiseNode = dynamic_cast<MKLDNNEltwiseNode*>(node.get());
|
||||
if (eltwiseNode->getParentEdgesAtPort(0)[0]->getParent().get() != this) {
|
||||
if (!isSuitableNode(this)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Eltwise jitter doesn't respect the commutative property, so fusing is disabled when it is applied to a port other than the 0-th.
|
||||
if (isOneOf(eltwiseNode->getOpType(), {Subtract, Divide, FloorMod, Mod, PowerDynamic, Greater, GreaterEqual, Less, LessEqual})) {
|
||||
return false;
|
||||
|
@ -28,6 +28,7 @@ std::vector<std::vector<std::vector<size_t>>> inShapes = {
|
||||
std::vector<InferenceEngine::Precision> netPrecisions = {
|
||||
InferenceEngine::Precision::FP32,
|
||||
InferenceEngine::Precision::FP16,
|
||||
InferenceEngine::Precision::I32,
|
||||
};
|
||||
|
||||
std::vector<ngraph::helpers::InputLayerType> secondaryInputTypes = {
|
||||
|
@ -136,7 +136,7 @@ std::vector<std::vector<InferenceEngine::Precision>> inputPrecisions = {
|
||||
|
||||
std::vector<std::vector<EltwiseTypes>> eltwiseOps = {
|
||||
{ EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT },
|
||||
{ EltwiseTypes::DIVIDE, EltwiseTypes::POWER, EltwiseTypes::ADD },
|
||||
{ EltwiseTypes::DIVIDE, EltwiseTypes::SQUARED_DIFF, EltwiseTypes::ADD },
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(smoke_EltwiseChain, EltwiseChainTest,
|
||||
|
@ -168,7 +168,8 @@ protected:
|
||||
}
|
||||
|
||||
const auto max = std::max(CommonTestUtils::ie_abs(res), CommonTestUtils::ie_abs(ref));
|
||||
ASSERT_TRUE(max != 0 && ((absoluteDifference / max) <= threshold))
|
||||
float diff = static_cast<float>(absoluteDifference) / static_cast<float>(max);
|
||||
ASSERT_TRUE(max != 0 && (diff <= static_cast<float>(threshold)))
|
||||
<< "Relative comparison of values expected: " << ref << " and actual: " << res
|
||||
<< " at index " << i << " with threshold " << threshold
|
||||
<< " failed";
|
||||
|
@ -635,8 +635,7 @@ tests_expected_to_fail = [
|
||||
"OnnxBackendNodeModelTest.test_adagrad_multiple_cpu",
|
||||
"OnnxBackendNodeModelTest.test_adagrad_cpu"),
|
||||
(xfail_issue_41894,
|
||||
"OnnxBackendNodeModelTest.test_max_uint16_cpu",
|
||||
"OnnxBackendNodeModelTest.test_mod_int64_fmod_cpu"),
|
||||
"OnnxBackendNodeModelTest.test_max_uint16_cpu"),
|
||||
(xfail_issue_43523,
|
||||
"OnnxBackendNodeModelTest.test_reduce_sum_do_not_keepdims_example_cpu",
|
||||
"OnnxBackendNodeModelTest.test_reduce_sum_do_not_keepdims_random_cpu",
|
||||
|
Loading…
Reference in New Issue
Block a user