[BF16] BF16 simulator for AVX512 was done (#3424)

parent 2cfc8ade62
commit 3bcac1641d
@@ -33,6 +33,9 @@ Config::Config() {
     streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::CORES;
 #endif
 
+    if (!with_cpu_x86_bfloat16())
+        enforceBF16 = false;
+
     updateProperties();
 }
 

@@ -93,7 +96,7 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
             dumpQuantizedGraphToIr = val;
         } else if (key == PluginConfigParams::KEY_ENFORCE_BF16) {
             if (val == PluginConfigParams::YES) {
-                if (with_cpu_x86_bfloat16())
+                if (with_cpu_x86_avx512_core())
                     enforceBF16 = true;
                 else
                     THROW_IE_EXCEPTION << "Platform doesn't support BF16 format";

@@ -143,8 +146,6 @@ void Config::updateProperties() {
     _config.insert({ PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS, std::to_string(streamExecutorConfig._streams) });
     _config.insert({ PluginConfigParams::KEY_CPU_THREADS_NUM, std::to_string(streamExecutorConfig._threads) });
     _config.insert({ PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT, dumpToDot });
-    if (!with_cpu_x86_bfloat16())
-        enforceBF16 = false;
     if (enforceBF16)
         _config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES });
     else
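The Config hunks above move the hard capability reset into the constructor and relax readProperties(): KEY_ENFORCE_BF16 is now accepted on any AVX512-capable core, not only on CPUs with native BF16 instructions. A minimal sketch of the resulting three-tier dispatch, assuming the InferenceEngine feature probes used above (declared in ie_system_conf.h); this is illustrative only, not code from the commit:

    #include <ie_system_conf.h>  // with_cpu_x86_bfloat16(), with_cpu_x86_avx512_core()

    enum class Bf16Mode { Native, Simulated, Fp32Fallback };

    // Hypothetical helper showing how the plugin now tiers BF16 support.
    Bf16Mode select_bf16_mode() {
        if (InferenceEngine::with_cpu_x86_bfloat16())
            return Bf16Mode::Native;       // AVX512_BF16: use vcvtneps2bf16 directly
        if (InferenceEngine::with_cpu_x86_avx512_core())
            return Bf16Mode::Simulated;    // AVX512F only: jit_emu_vcvtneps2bf16
        return Bf16Mode::Fp32Fallback;     // older ISAs: enforceBF16 is reset to false
    }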
@@ -63,7 +63,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
         i++;
     }
 
-    if (with_cpu_x86_bfloat16() && isFloatModel) {
+    if (with_cpu_x86_avx512_core() && isFloatModel) {
         BF16Transformer bf16Transformer;
         CNNNetwork cnnetwork(_clonedNetwork);
         // If enforceBF16 flag was set, BF16 transformation applies for all layers supported by CPU plugin.
@@ -13,7 +13,7 @@ namespace MKLDNNPlugin {
 
 class jit_emitter {
 public:
-    jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
                 InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
         : h(host), host_isa_(host_isa), n(node), exec_prc_(exec_prc) {
         k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well

@@ -32,7 +32,7 @@ protected:
     size_t get_max_vecs_count() const;
     size_t get_vec_length() const;
 
-    const MKLDNNNode& n;
+    const MKLDNNNode* n;
     mkldnn::impl::cpu::jit_generator* h;
     mkldnn::impl::cpu::cpu_isa_t host_isa_;
     InferenceEngine::Precision exec_prc_;
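Changing jit_emitter (and, below, every derived emitter) to take the node by pointer rather than by reference makes node-less emitters possible at all: the new jit_emu_vcvtneps2bf16 conversion emitter is not tied to any graph node and is constructed with nullptr for this argument.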
@@ -48,8 +48,12 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
     jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator() {
         exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
 
+        if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+            emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
         this->preamble();
 
         mov(reg_src, ptr[reg_params + GET_OFF(src)]);
         mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
         mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]);

@@ -72,16 +76,16 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
         load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
 
-        if (isa == sse42) {
+        if (isa == cpu::sse42) {
             uni_vmovups(vmm_mask, vmm_val);
             uni_vcmpgtps(vmm_mask, vmm_mask, vmm_max);
-        } else if (isa == avx2) {
+        } else if (isa == cpu::avx2) {
             uni_vcmpgtps(vmm_mask, vmm_val, vmm_max);
         } else {
             vcmpps(k_mask, vmm_val, vmm_max, _cmp_nle_us);
         }
 
-        if (isa == avx512_common) {
+        if (isa == cpu::avx512_common) {
             vptestmd(k_mask, vmm_mask, vmm_mask);
             vblendmps(vmm_max | k_mask, vmm_max, vmm_val);
         } else {

@@ -143,13 +147,17 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
         this->postamble();
 
+        if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+            emu_vcvtneps2bf16->emit_table();
+
         exp_injector->prepare_table();
 
         ker_ = (decltype(ker_))this->getCode();
     }
 
 private:
-    using Vmm = typename conditional3<isa == sse42, Xbyak::Xmm, isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
+    using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
+            Xbyak::Ymm, Xbyak::Zmm>::type;
     size_t vlen = cpu_isa_traits<isa>::vlen;
 
     Xbyak::Reg64 reg_src = r8;
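The softmax kernel shows the integration pattern that the eltwise, interpolate, MVN, normalize, reduce, and logistic kernels below all repeat. A condensed sketch using the names from the hunks above (the struct name is hypothetical and the kernel body is elided):

    // Sketch of the recurring pattern, not a complete kernel.
    template <cpu_isa_t isa>
    struct some_jit_kernel : public jit_generator {
        std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;

        some_jit_kernel() {
            // Instantiate the simulator only when the native instruction is
            // absent but AVX512F (avx512_core) is present.
            if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
                emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));

            this->preamble();
            // ... kernel body; BF16 stores go through store_vector() ...
            this->postamble();

            // The emulator loads its constants ("one", "even", "selector") from
            // a data table that must be emitted after the code stream ends.
            if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
                emu_vcvtneps2bf16->emit_table();
        }
    };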
@@ -169,6 +177,8 @@ private:
 
     const Xbyak::Opmask k_mask = Xbyak::Opmask(1);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
 
     inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_dt) {

@@ -192,8 +202,11 @@ private:
             uni_vmovups(op, vmm_dst);
             break;
         case Precision::BF16:
-            vcvtneps2bf16(ymm_dst, vmm_dst);
-            uni_vmovups(op, ymm_dst);
+            if (mayiuse(avx512_core_bf16))
+                vcvtneps2bf16(ymm_dst, vmm_dst);
+            else
+                emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
+            vmovdqu16(op, ymm_dst);
             break;
         default:
             assert(!"unknown dst_dt");
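Either branch leaves the packed BF16 words in ymm_dst — natively via vcvtneps2bf16 or through the emulator's emit() — so a single vmovdqu16 store serves both; the old uni_vmovups is gone because the result is now packed 16-bit data rather than a full-width float vector.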
@@ -204,7 +217,7 @@ private:
 SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
     : input_prec(inpPrc), output_prec(outPrc) {
     if (Precision::BF16 == output_prec) {
-        if (!mayiuse(avx512_core_bf16)) {
+        if (!mayiuse(avx512_core)) {
             THROW_IE_EXCEPTION << "SoftmaxGeneric doesn't support BF16 precision on this target.";
         }
     }

@@ -214,14 +227,14 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
     jcp.src_dt = inpPrc;
     jcp.dst_dt = outPrc;
 
-    if (mayiuse(avx512_common)) {
-        softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx512_common>(jcp));
+    if (mayiuse(cpu::avx512_common)) {
+        softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx512_common>(jcp));
         block_size = 16;
-    } else if (mayiuse(avx2)) {
-        softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx2>(jcp));
+    } else if (mayiuse(cpu::avx2)) {
+        softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx2>(jcp));
         block_size = 8;
-    } else if (mayiuse(sse42)) {
-        softmax_kernel.reset(new jit_uni_softmax_kernel_f32<sse42>(jcp));
+    } else if (mayiuse(cpu::sse42)) {
+        softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::sse42>(jcp));
         block_size = 4;
     }
 }
@@ -16,7 +16,7 @@ using namespace Xbyak;
 namespace MKLDNNPlugin {
 
 /// ADD ///
-jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_add_emitter::get_inputs_num() { return 2; }

@@ -50,7 +50,7 @@ void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std
 }
 
 /// MUL_ADD ///
-jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_mul_add_emitter::get_inputs_num() { return 3; }

@@ -109,7 +109,7 @@ size_t jit_mul_add_emitter::aux_vecs_count() const {
 }
 
 /// SUB ///
-jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_subtract_emitter::get_inputs_num() { return 2; }

@@ -144,7 +144,7 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, cons
 
 
 /// MULTIPLY ///
-jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_multiply_emitter::get_inputs_num() { return 2; }

@@ -179,7 +179,7 @@ void jit_multiply_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, cons
 
 
 /// DIVIDE ///
-jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_divide_emitter::get_inputs_num() { return 2; }

@@ -214,7 +214,7 @@ void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
 
 
 /// FLOOR_MOD ///
-jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_floor_mod_emitter::get_inputs_num() { return 2; }

@@ -263,7 +263,7 @@ size_t jit_floor_mod_emitter::aux_vecs_count() const {
 }
 
 /// MOD ///
-jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_mod_emitter::get_inputs_num() { return 2; }

@@ -312,7 +312,7 @@ size_t jit_mod_emitter::aux_vecs_count() const {
 }
 
 /// MAXIMUM ///
-jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_maximum_emitter::get_inputs_num() { return 2; }

@@ -359,7 +359,7 @@ std::set<InferenceEngine::Precision> jit_maximum_emitter::get_supported_precisio
 }
 
 /// MINIMUM ///
-jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_minimum_emitter::get_inputs_num() { return 2; }

@@ -406,7 +406,7 @@ std::set<InferenceEngine::Precision> jit_minimum_emitter::get_supported_precisio
 }
 
 /// SQUARED_DIFFERENCE ///
-jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_squared_difference_emitter::get_inputs_num() { return 2; }

@@ -444,7 +444,7 @@ void jit_squared_difference_emitter::emit_isa(const std::vector<size_t> &in_vec_
 
 
 /// POWER_DYNAMIC ///
-jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {}
 
 size_t jit_power_dynamic_emitter::get_inputs_num() { return 2; }

@@ -550,7 +550,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
 
 
 /// EQUAL ///
-jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -606,7 +606,7 @@ size_t jit_equal_emitter::aux_vecs_count() const {
 }
 
 /// NOT_EQUAL ///
-jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -662,7 +662,7 @@ size_t jit_not_equal_emitter::aux_vecs_count() const {
 }
 
 /// GREATER ///
-jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -718,7 +718,7 @@ size_t jit_greater_emitter::aux_vecs_count() const {
 }
 
 /// GREATER_EQUAL ///
-jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -774,7 +774,7 @@ size_t jit_greater_equal_emitter::aux_vecs_count() const {
 }
 
 /// LESS ///
-jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -830,7 +830,7 @@ size_t jit_less_emitter::aux_vecs_count() const {
 }
 
 /// LESS_EQUAL ///
-jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -887,7 +887,7 @@ size_t jit_less_equal_emitter::aux_vecs_count() const {
 }
 
 /// LOGICAL_AND ///
-jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -964,7 +964,7 @@ size_t jit_logical_and_emitter::aux_vecs_count() const {
 
 
 /// LOGICAL_OR ///
-jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -1040,7 +1040,7 @@ size_t jit_logical_or_emitter::aux_vecs_count() const {
 }
 
 /// LOGICAL_XOR ///
-jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -1116,7 +1116,7 @@ size_t jit_logical_xor_emitter::aux_vecs_count() const {
 }
 
 /// LOGICAL_NOT ///
-jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -1171,7 +1171,7 @@ size_t jit_logical_not_emitter::aux_vecs_count() const {
 }
 
 /// POWER_STATIC ///
-jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }

@@ -1198,7 +1198,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
     Vmm vmm_dst = Vmm(out_vec_idxs[0]);
     Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
 
-    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
+    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n->getCnnLayer().get());
     if (powerLayer == nullptr)
         THROW_IE_EXCEPTION << "Cannot convert power layer.";
 

@@ -1340,7 +1340,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
 }
 
 void jit_power_static_emitter::register_table_entries() {
-    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
+    auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n->getCnnLayer().get());
     if (powerLayer == nullptr)
         THROW_IE_EXCEPTION << "Cannot convert power layer.";
 

@@ -1359,7 +1359,7 @@ size_t jit_power_static_emitter::aux_vecs_count() const {
 }
 
 /// PRELU ///
-jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
+jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
     prepare_table();
 }
@@ -12,7 +12,7 @@ namespace MKLDNNPlugin {
 
 class jit_add_emitter : public jit_emitter {
 public:
-    jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
             InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -27,7 +27,7 @@ private:
 
 class jit_mul_add_emitter : public jit_emitter {
 public:
-    jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -45,7 +45,7 @@ private:
 
 class jit_subtract_emitter : public jit_emitter {
 public:
-    jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -61,7 +61,7 @@ private:
 
 class jit_multiply_emitter : public jit_emitter {
 public:
-    jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -77,7 +77,7 @@ private:
 
 class jit_divide_emitter : public jit_emitter {
 public:
-    jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -93,7 +93,7 @@ private:
 
 class jit_floor_mod_emitter : public jit_emitter {
 public:
-    jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -110,7 +110,7 @@ private:
 
 class jit_mod_emitter : public jit_emitter {
 public:
-    jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -127,7 +127,7 @@ private:
 
 class jit_maximum_emitter : public jit_emitter {
 public:
-    jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -144,7 +144,7 @@ private:
 
 class jit_minimum_emitter : public jit_emitter {
 public:
-    jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -161,7 +161,7 @@ private:
 
 class jit_squared_difference_emitter : public jit_emitter {
 public:
-    jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -177,7 +177,7 @@ private:
 
 class jit_power_dynamic_emitter : public jit_emitter {
 public:
-    jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -193,7 +193,7 @@ private:
 
 class jit_equal_emitter : public jit_emitter {
 public:
-    jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -212,7 +212,7 @@ private:
 
 class jit_not_equal_emitter : public jit_emitter {
 public:
-    jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -231,7 +231,7 @@ private:
 
 class jit_greater_emitter : public jit_emitter {
 public:
-    jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -250,7 +250,7 @@ private:
 
 class jit_greater_equal_emitter : public jit_emitter {
 public:
-    jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -269,7 +269,7 @@ private:
 
 class jit_less_emitter : public jit_emitter {
 public:
-    jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -288,7 +288,7 @@ private:
 
 class jit_less_equal_emitter : public jit_emitter {
 public:
-    jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -307,7 +307,7 @@ private:
 
 class jit_logical_and_emitter : public jit_emitter {
 public:
-    jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -326,7 +326,7 @@ private:
 
 class jit_logical_or_emitter : public jit_emitter {
 public:
-    jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -345,7 +345,7 @@ private:
 
 class jit_logical_xor_emitter : public jit_emitter {
 public:
-    jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -363,7 +363,7 @@ private:
 
 class jit_logical_not_emitter : public jit_emitter {
 public:
-    jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -381,7 +381,7 @@ private:
 
 class jit_power_static_emitter : public jit_emitter {
 public:
-    jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;

@@ -399,7 +399,7 @@ private:
 
 class jit_prelu_emitter : public jit_emitter {
 public:
-    jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;
@@ -13,9 +13,9 @@ using namespace Xbyak;
 
 namespace MKLDNNPlugin {
 
-jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, InferenceEngine::Precision exec_prc)
+jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc)
     : jit_emitter(host, host_isa, node, exec_prc) {
-    auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(n);
+    auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(*n);
 
     auto alg = static_cast<mkldnn_alg_kind_t>(eltwiseNode.getAlgorithm());
 

@@ -13,7 +13,7 @@ namespace MKLDNNPlugin {
 
 class jit_mkldnn_emitter : public jit_emitter {
 public:
-    jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
+    jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
 
     size_t get_inputs_num() override;
@@ -11,6 +11,7 @@
 #include <cmath>
 #include <mkldnn_types.h>
 #include <mkldnn_extension_utils.h>
+#include "utils/bfloat16.hpp"
 #include "ie_parallel.hpp"
 #include "mkldnn_quantize_node.h"
 #include <map>

@@ -45,7 +46,7 @@ struct EltwiseEmitterContext {
     std::shared_ptr<jit_emitter> emitter;
     mkldnn::impl::cpu::jit_generator *host;
     mkldnn::impl::cpu::cpu_isa_t host_isa;
-    const MKLDNNNode & node;
+    const MKLDNNNode * node;
     InferenceEngine::Precision exec_prc;
 };
 

@@ -108,6 +109,9 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
         }
     }
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
     this->preamble();
 
     for (int i = 0; i < jep.inputs_number; i++)

@@ -273,6 +277,9 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
 
     this->postamble();
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16->emit_table();
+
     eltwise_emitter->emit_table();
     for (int i = 0; i < post_op_emitters.size(); i++) {
         post_op_emitters[i]->emit_table();

@@ -320,6 +327,8 @@ private:
     Vmm vmm_d_bias = Vmm(13);
     Vmm vmm_zero = Vmm(15);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     std::shared_ptr<jit_emitter> eltwise_emitter = nullptr;
     std::vector<std::shared_ptr<jit_emitter>> post_op_emitters = {};
 

@@ -392,12 +401,13 @@ private:
 
     std::shared_ptr<jit_emitter> create_eltwise_emitter(MKLDNNNode& node, Precision exec_prec) {
-        auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(node);
+        const MKLDNNNode * eltwiseNodePtr = dynamic_cast<const MKLDNNNode*>(&node);
 
         EltwiseEmitterContext ctx = {
             nullptr,
             this,
             isa,
-            eltwiseNode,
+            eltwiseNodePtr,
             exec_prec
         };
 

@@ -615,8 +625,11 @@ private:
             uni_vmovups(op, vmm_dst);
             break;
         case Precision::BF16:
-            vcvtneps2bf16(ymm_dst, vmm_dst);
-            uni_vmovups(op, ymm_dst);
+            if (mayiuse(avx512_core_bf16))
+                vcvtneps2bf16(ymm_dst, vmm_dst);
+            else
+                emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
+            vmovdqu16(op, ymm_dst);
             break;
         case Precision::I16:
            if (isa == avx512_common) {

@@ -1024,7 +1037,7 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
         }
     }
 
-    if (!mayiuse(avx512_core_bf16)) {
+    if (!mayiuse(avx512_core)) {
         bool hasBF16 = false;
         for (auto &inPrc : inputPrecisions)
             if (inPrc == Precision::BF16)
@@ -21,7 +21,7 @@
 #include "jit_uni_depthwise.hpp"
 #include "jit_uni_quantization.hpp"
 #include "common/cpu_memcpy.h"
-#include "ngraph/type/bfloat16.hpp"
+#include "utils/bfloat16.hpp"
 
 using namespace mkldnn;
 using namespace MKLDNNPlugin;

@@ -59,6 +59,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
         }
     }
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
     this->preamble();
 
     if (attr_.post_ops_.len_ != 0)

@@ -134,6 +137,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
 
     this->postamble();
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16->emit_table();
+
     for (auto& inj : eltwise_injectors)
         inj->prepare_table();
     if ((jcp_.mode == InterpolateMode::cubic) && (jcp_.layout == InterpolateLayoutType::planar)) {

@@ -224,6 +230,8 @@ private:
     Xbyak::Label l_table_constant;
     Opmask k_mask = Xbyak::Opmask(1);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
     std::vector<std::shared_ptr<jit_uni_depthwise_injector_f32<isa>>> depthwise_injectors;
     std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors;

@@ -1278,12 +1286,11 @@ private:
             movd(op, xmm_dst);
         }
     } else if (dst_dt == memory::bf16) {
-        if (mayiuse(avx512_core_bf16)) {
+        if (mayiuse(avx512_core_bf16))
             vcvtneps2bf16(ymm_dst, vmm_dst);
-            uni_vmovups(op, ymm_dst);
-        } else {
-            assert(!"data type of bf16 is only supported for ISA:avx512_core_bf16");
-        }
+        else
+            emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
+        vmovdqu16(op, ymm_dst);
     }
 }
 

@@ -1584,7 +1591,7 @@ void MKLDNNInterpolateNode::initSupportedPrimitiveDescriptors() {
     if ((inputPrecision != Precision::I8) && (inputPrecision != Precision::U8) && (inputPrecision != Precision::BF16)) {
         inputPrecision = Precision::FP32;
     }
-    if ((inputPrecision == Precision::BF16) && !mayiuse(avx512_core_bf16)) {
+    if ((inputPrecision == Precision::BF16) && !mayiuse(avx512_core)) {
         inputPrecision = Precision::FP32;
     }
     Precision outputPrecision = inputPrecision;

@@ -2714,7 +2721,7 @@ float MKLDNNInterpolateNode::getValue(const uint8_t *base, size_t offset, Infere
     }
     case Precision::BF16: {
         const uint16_t *valuePtr = reinterpret_cast<const uint16_t *>(baseOffset);
-        return ngraph::bfloat16::from_bits(*valuePtr);
+        return bfloat16_t::from_bits(*valuePtr);
         break;
     }
     case Precision::FP32: {

@@ -2743,7 +2750,7 @@ void MKLDNNInterpolateNode::setValue(uint8_t *base, size_t offset, float value,
         break;
     }
     case Precision::BF16: {
-        uint16_t data = ngraph::bfloat16(value).to_bits();
+        uint16_t data = bfloat16_t(value).to_bits();
         std::memcpy(baseOffset, &data, 2);
         break;
     }
@@ -240,6 +240,9 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
         }
     }
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
     this->preamble();
 
     mov(reg_src, ptr[reg_params + GET_OFF(src)]);

@@ -311,6 +314,9 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
 
     this->postamble();
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16->emit_table();
+
     for (auto& inj : eltwise_injectors)
         inj->prepare_table();
 

@@ -344,6 +350,8 @@ private:
     Vmm vmm_d_weights = Vmm(5);
     Vmm vmm_d_bias = Vmm(6);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     Xbyak::Label l_table;
 
     std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;

@@ -381,8 +389,11 @@ private:
     if (dst_dt == memory::f32) {
         uni_vmovups(op, vmm_dst);
     } else if (dst_dt == memory::bf16) {
-        vcvtneps2bf16(ymm_dst, vmm_dst);
-        uni_vmovups(op, ymm_dst);
+        if (mayiuse(avx512_core_bf16))
+            vcvtneps2bf16(ymm_dst, vmm_dst);
+        else
+            emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
+        vmovdqu16(op, ymm_dst);
     } else if (dst_dt == memory::u8) {
         uni_vcvtps2dq(vmm_dst, vmm_dst);
         if (isa == cpu::avx512_common) {

@@ -504,7 +515,7 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
         }
     }
 
-    if (!mayiuse(avx512_core_bf16)) {
+    if (!mayiuse(avx512_core)) {
         if (outputPrecision == Precision::BF16)
             outputPrecision = Precision::FP32;
     }
@@ -165,6 +165,9 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
         }
     }
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
     this->preamble();
 
     mov(reg_src, ptr[reg_params + GET_OFF(src)]);

@@ -188,6 +191,8 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
 
     this->postamble();
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16->emit_table();
     for (auto& inj : eltwise_injectors)
         inj->prepare_table();
 

@@ -230,6 +235,8 @@ private:
     Vmm vmm_d_bias = Vmm(6);
     Vmm vmm_zero = Vmm(7);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
     std::vector<std::shared_ptr<jit_uni_depthwise_injector_f32<isa>>> depthwise_injectors;
     std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors;

@@ -580,7 +587,10 @@ private:
     if (dst_dt == memory::f32) {
         uni_vmovups(op, vmm_dst);
     } else if (dst_dt == memory::bf16) {
-        vcvtneps2bf16(ymm_dst, vmm_dst);
+        if (mayiuse(avx512_core_bf16))
+            vcvtneps2bf16(ymm_dst, vmm_dst);
+        else
+            emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
         vmovdqu16(op, ymm_dst);
     } else if (dst_dt == memory::u8) {
         uni_vcvtps2dq(vmm_dst, vmm_dst);

@@ -752,7 +762,7 @@ void MKLDNNNormalizeNode::getSupportedDescriptors() {
         weights_blob->allocate();
         float* src = layer->blobs.at("weights")->buffer();
         float* dst = weights_blob->wmap();
-        memcpy(dst, src, layer->blobs.at("weights")->byteSize());
+        cpu_memcpy(dst, src, layer->blobs.at("weights")->byteSize());
     } else if (weights_prec == Precision::BF16) {
         MKLDNNPlugin::BF16Transformer transformer;
         weights_blob = transformer.convertBF16ToFloat(tweights);

@@ -780,7 +790,7 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
     }
 
     if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) {
-        if (!mayiuse(avx512_core_bf16))
+        if (!mayiuse(avx512_core))
             inputPrecision = outputPrecision = Precision::FP32;
         else
             inputPrecision = outputPrecision = Precision::BF16;
@@ -78,6 +78,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
     : jit_uni_reduce_kernel(jcp), jit_generator() {
     exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
     this->preamble();
 
     mov(reg_src, ptr[reg_params + GET_OFF(src)]);

@@ -103,6 +106,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
 
     this->postamble();
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16->emit_table();
+
     if (jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::L1 || jcp_.reduce_mode == Reduce::Max ||
         jcp_.reduce_mode == Reduce::Min || jcp_.reduce_mode == Reduce::Prod || jcp_.reduce_mode == Reduce::Or) {
         prepare_aux_table();

@@ -146,6 +152,8 @@ private:
 
     const Xbyak::Opmask k_mask = Xbyak::Opmask(1);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     Xbyak::Label l_table;
 
     std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;

@@ -605,8 +613,11 @@ private:
             uni_vmovups(op, vmm_dst);
             break;
         case memory::bf16:
-            vcvtneps2bf16(ymm_dst, vmm_dst);
-            uni_vmovups(op, ymm_dst);
+            if (mayiuse(avx512_core_bf16))
+                vcvtneps2bf16(ymm_dst, vmm_dst);
+            else
+                emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
+            vmovdqu16(op, ymm_dst);
             break;
         case memory::s8:
             if (isa == avx512_common) {

@@ -806,6 +817,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
     : jit_uni_reduce_post_kernel(jcp), jit_generator() {
     log_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_log, 0.f, 0.f));
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
     this->preamble();
 
     mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);

@@ -823,6 +837,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
 
     this->postamble();
 
+    if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+        emu_vcvtneps2bf16->emit_table();
+
     if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) {
         log_injector->prepare_table();
     }

@@ -855,6 +872,8 @@ private:
     Xbyak::Xmm xmm_aux2 = Xbyak::Xmm(5);
     Xbyak::Xmm xmm_aux3 = Xbyak::Xmm(6);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> log_injector;
 
     inline void reduce_post_main() {

@@ -1063,8 +1082,11 @@ private:
             uni_vmovups(op, vmm_dst);
             break;
         case memory::bf16:
-            vcvtneps2bf16(ymm_dst, vmm_dst);
-            uni_vmovups(op, ymm_dst);
+            if (mayiuse(avx512_core_bf16))
+                vcvtneps2bf16(ymm_dst, vmm_dst);
+            else
+                emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
+            vmovdqu16(op, ymm_dst);
             break;
         case memory::s8:
             if (isa == avx512_common) {

@@ -1355,7 +1377,7 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
     // Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16 output precision due to
     // the possible accuracy loss. Therefore, for such modes, we will change the output precision to FP32.
     if (Precision::BF16 == outputPrecision) {
-        if (!mayiuse(avx512_core_bf16)) {
+        if (!mayiuse(avx512_core)) {
             outputPrecision = Precision::FP32;
         } else if (reduceMode != Reduce::And && reduceMode != Reduce::Or &&
                    reduceMode != Reduce::Max && reduceMode != Reduce::Min) {
@@ -16,8 +16,9 @@
 #include "jit_generator.hpp"
 #include "jit_uni_eltwise.hpp"
 
-using namespace MKLDNNPlugin;
 using namespace mkldnn;
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
 using namespace mkldnn::impl::cpu;
 using namespace mkldnn::impl::utils;
 

@@ -56,6 +57,9 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
     jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jit_uni_logistic_kernel(), jit_generator() {
         exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
 
+        if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+            emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
+
         this->preamble();
 
         mov(reg_src, ptr[reg_params + GET_OFF(src)]);

@@ -103,6 +107,9 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
 
         this->postamble();
 
+        if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
+            emu_vcvtneps2bf16->emit_table();
+
         exp_injector->prepare_table();
 
         prepare_table();

@@ -111,7 +118,7 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
 }
 
 private:
-    using Vmm = typename conditional3<isa == sse42, Xbyak::Xmm, isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
+    using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
     size_t vlen = cpu_isa_traits<isa>::vlen;
 
     Xbyak::Address table_val(int index) { return ptr[reg_table + index * vlen]; }

@@ -130,6 +137,8 @@ private:
 
     const Xbyak::Opmask k_mask = Xbyak::Opmask(1);
 
+    std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
+
     Xbyak::Label l_table;
 
     std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;

@@ -148,10 +157,10 @@ private:
     uni_vmovups(vmm_aux2, table_val(1));
     uni_vsubps(vmm_aux2, vmm_aux2, vmm_src);
 
-    if (isa == sse42) {
+    if (isa == cpu::sse42) {
         uni_vblendvps(vmm_aux2, vmm_aux2, vmm_src, vmm_aux0);
         uni_vmovups(vmm_src, vmm_aux2);
-    } else if (isa == avx2) {
+    } else if (isa == cpu::avx2) {
         uni_vblendvps(vmm_src, vmm_aux2, vmm_src, vmm_aux0);
     } else {
         vptestmd(k_mask, vmm_aux0, vmm_aux0);

@@ -199,8 +208,11 @@ private:
             uni_vmovups(op, vmm_dst);
             break;
         case InferenceEngine::Precision::BF16:
-            vcvtneps2bf16(ymm_dst, vmm_dst);
-            uni_vmovups(op, ymm_dst);
+            if (mayiuse(avx512_core_bf16))
+                vcvtneps2bf16(ymm_dst, vmm_dst);
+            else
+                emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
+            vmovdqu16(op, ymm_dst);
             break;
         default:
             assert(!"unknown dst_dt");

@@ -253,7 +265,7 @@ public:
     }
 
     if (Precision::BF16 == output_prec) {
-        if (!mayiuse(avx512_core_bf16)) {
+        if (!mayiuse(avx512_core)) {
             output_prec = Precision::FP32;
         }
     }

@@ -269,14 +281,14 @@ public:
     jcp.src_data_size = jcp.dst_data_size = output_prec.size();
 
     block_size = 1;
-    if (mayiuse(avx512_common)) {
-        logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx512_common>(jcp));
+    if (mayiuse(cpu::avx512_common)) {
+        logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx512_common>(jcp));
         block_size = 16;
-    } else if (mayiuse(avx2)) {
-        logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx2>(jcp));
+    } else if (mayiuse(cpu::avx2)) {
+        logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx2>(jcp));
         block_size = 8;
-    } else if (mayiuse(sse42)) {
-        logistic_kernel.reset(new jit_uni_logistic_kernel_f32<sse42>(jcp));
+    } else if (mayiuse(cpu::sse42)) {
+        logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::sse42>(jcp));
         block_size = 4;
     }
 

@@ -383,7 +395,7 @@ private:
     inline void calculate_logistic(size_t start_index, int count, uint8_t * dst_data) {
         auto dst_data_size = output_prec.size();
         if (logistic_kernel) {
-            int blocks_num = div_up(count, block_size);
+            int blocks_num = MKLDNNPlugin::div_up(count, block_size);
             parallel_for(blocks_num, [&](int ib) {
                 int idx = ib * block_size;
                 int work_amount = std::min(count - idx, block_size);
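The cpu:: and MKLDNNPlugin:: qualifications sprinkled through this file (and the softmax kernel earlier) are most likely forced by the new include: utils/bfloat16.hpp now pulls jit_emitter and its namespaces into these translation units, so previously unambiguous names such as sse42, avx2, avx512_common, and div_up need explicit qualification.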
@@ -6,13 +6,15 @@
 
 #include <cmath>
 #include <limits>
 #include "utils.hpp"
+#include "nodes/common/emitter.h"
 
 /**
  * The bfloat16_t class can be used as an arithmetic type. All arithmetic operations go through conversion to the float data type.
  */
 
-#define BFLOAT16_ROUND_MODE_TRUNCATE
+#define BFLOAT16_ROUND_MODE_TO_NEAREST_EVEN
 
 namespace MKLDNNPlugin {
 class bfloat16_t {

@@ -71,6 +73,69 @@ private:
     };
     uint16_t m_value;
 };
 
+class jit_emu_vcvtneps2bf16 : public jit_emitter {
+public:
+    jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
+            InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, node, exec_prc) {
+        prepare_table();
+    };
+
+    size_t get_inputs_num() { return 1; };
+
+private:
+    void emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs,
+            const std::vector<size_t>& pool_vec_idxs, const std::vector<size_t>& pool_gpr_idxs) {
+        if (host_isa_ == mkldnn::impl::cpu::cpu_isa_t::avx512_common) {
+            Xbyak::Zmm in = Xbyak::Zmm(in_vec_idxs[0]);
+            Xbyak::Ymm out = Xbyak::Ymm(out_vec_idxs[0]);
+            Xbyak::Zmm aux = Xbyak::Zmm(aux_vec_idxs[0]);
+            Xbyak::Zmm aux1 = Xbyak::Zmm(aux_vec_idxs[1]);
+
+            h->uni_vpsrld(aux, in, 16);
+            h->vpandd(aux, aux, table_val("one"));
+            h->uni_vmovups(aux1, table_val("even"));
+            h->uni_vpaddd(aux, aux1, aux);
+            h->uni_vpaddd(aux, in, aux);
+            h->vfixupimmps(aux, in, table_val("selector"), 0);
+            h->vpsrad(aux, aux, 16);
+            h->vpmovdw(out, aux);
+        } else {
+            assert(!"unsupported isa");
+        }
+    };
+
+    inline int encode_fixup_selector(int input, int output) {
+        return ((output) << (4 * (input)));
+    }
+
+    void register_table_entries() {
+        enum {
+            fixup_input_code_qnan_ = 0,
+            fixup_input_code_snan_ = 1,
+            fixup_input_code_ninf_ = 4,
+            fixup_input_code_pinf_ = 5,
+            fixup_output_code_copy_input_ = 1,
+            fixup_output_code_qnan_input_ = 2,
+        };
+        const int selector_int32 =
+            /* snan input to qnan output (preserving input bits 0..21) */
+            encode_fixup_selector(fixup_input_code_snan_, fixup_output_code_qnan_input_) |
+            /* qnan input to qnan output (preserving input bits 0..21) */
+            encode_fixup_selector(fixup_input_code_qnan_, fixup_output_code_qnan_input_) |
+            /* neg inf input copied to output */
+            encode_fixup_selector(fixup_input_code_ninf_, fixup_output_code_copy_input_) |
+            /* pos inf input copied to output */
+            encode_fixup_selector(fixup_input_code_pinf_, fixup_output_code_copy_input_);
+
+        push_arg_entry_of("one", 0x00000001, true);
+        push_arg_entry_of("even", 0x00007fff, true);
+        push_arg_entry_of("selector", selector_int32, true);
+    }
+
+    size_t aux_vecs_count() const { return 2; }
+};
 } // namespace MKLDNNPlugin
 
 /**

@@ -139,3 +204,4 @@ public:
     static constexpr float_round_style round_style = round_to_nearest;
 };
 } // namespace std
+
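jit_emu_vcvtneps2bf16 is the simulator the commit title refers to: on avx512_common it reproduces round-to-nearest-even float-to-BF16 conversion using only AVX512F integer ops, with vfixupimmps patching NaN/Inf inputs afterwards. The selector packs a 4-bit output action per input class; for instance, encode_fixup_selector(fixup_input_code_pinf_, fixup_output_code_copy_input_) evaluates to 1 << 20, i.e. action 1 in nibble 5. A scalar model of the same arithmetic, matching the "one"/"even" table constants — an illustrative sketch, not code from the commit:

    #include <cstdint>
    #include <cstring>

    // Scalar equivalent of the vector sequence in emit_impl():
    //   lsb  = (x >> 16) & 1     -> uni_vpsrld + vpandd("one")
    //   bias = 0x7fff + lsb      -> uni_vpaddd with "even"
    //   y    = (x + bias) >> 16  -> uni_vpaddd + vpsrad + vpmovdw
    uint16_t fp32_to_bf16_rne(float f) {
        uint32_t x;
        std::memcpy(&x, &f, sizeof x);
        if (((x >> 23) & 0xff) == 0xff)            // NaN/Inf: the JIT code
            return static_cast<uint16_t>(x >> 16); // handles these via vfixupimmps
        uint32_t lsb = (x >> 16) & 1;
        return static_cast<uint16_t>((x + 0x7fff + lsb) >> 16);
    }

Rounding to nearest-even instead of truncating is also why the bfloat16_t default rounding mode changes in the first hunk of this file.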
@ -185,12 +185,12 @@ protected:
|
||||
expectedPrecisions["ReLU1"] = "ndef";
|
||||
expectedPrecisions["Convolution2"] = "BF16";
|
||||
expectedPrecisions["Convolution3"] = "BF16";
|
||||
expectedPrecisions["ReLU2"] = "FP32";
|
||||
expectedPrecisions["Norm1"] = "FP32";
|
||||
expectedPrecisions["ReLU2"] = "BF16";
|
||||
expectedPrecisions["Norm1"] = "BF16";
|
||||
expectedPrecisions["Eltwise1"] = "ndef";
|
||||
expectedPrecisions["ReLU3"] = "ndef";
|
||||
expectedPrecisions["maxPooling1"] = "BF16";
|
||||
expectedPrecisions["Eltwise2"] = "FP32";
|
||||
expectedPrecisions["Eltwise2"] = "BF16";
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -179,9 +179,9 @@ public:
|
||||
}
|
||||
|
||||
void test() {
|
||||
if (!InferenceEngine::with_cpu_x86_bfloat16()) {
|
||||
// on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
|
||||
// tests are useless on such platforms
|
||||
if (!InferenceEngine::with_cpu_x86_avx512_core()) {
|
||||
// We are enabling bf16 tests on platforms with native support bfloat16, and on platforms with AVX512 ISA
|
||||
// On platforms with AVX512 ISA but w/o native bfloat16 support computations are done via simulation mode
|
||||
GTEST_SKIP();
|
||||
}
|
||||
std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
|
||||
|
@ -131,8 +131,6 @@ protected:
        expectedPrecisions["ADD_1"] = "FP32";
        expectedPrecisions["CONV_1"] = "BF16";
        expectedPrecisions["CONV_2"] = "BF16";
        expectedPrecisions["CONC_1_TEST"] = "BF16";
        expectedPrecisions["RELU_1"] = "FP32";
    }
};

@ -111,9 +111,7 @@ protected:
        // STAGE3:
        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
        // performance counters
        expectedPrecisions["Convolution_0"] = "BF16";
        expectedPrecisions["Convolution_1"] = "BF16";
        expectedPrecisions["Elt_sum"] = "FP32";
        expectedPrecisions["Elt_sum"] = "BF16";
    }
};

@ -100,7 +100,7 @@ protected:
        // performance counters
        expectedPrecisions["ADD_1"] = "FP32";
        expectedPrecisions["CONV_1"] = "BF16";
        expectedPrecisions["CONV_2"] = "BF16";
        expectedPrecisions["CONV_2"] = "FP32";
    }
};

@ -118,7 +118,6 @@ protected:
        // performance counters
        expectedPrecisions["ADD_1"] = "FP32";
        expectedPrecisions["CONV_1"] = "BF16";
        expectedPrecisions["CONV_2"] = "BF16";
        expectedPrecisions["RELU"] = "ndef";
    }
};

@ -32,7 +32,7 @@ public:
    std::shared_ptr<Function> fnPtr;
    SizeVector inputShapes;
    std::map<string, string> expectedPrecisions;
    float threshold = 3e-2;
    float threshold = 7e-2;
    Precision netPrecision;
    size_t kernel;
    CoordinateDiff pads;

@ -122,7 +122,6 @@ protected:
        // performance counters
        expectedPrecisions["Convolution_0"] = "BF16";
        expectedPrecisions["Convolution_1"] = "BF16";
        expectedPrecisions["Elt_max"] = "FP32";
    }
};

@ -179,8 +179,6 @@ protected:
        expectedPrecisions["Convolution_1"] = "BF16";
        expectedPrecisions["Convolution_2"] = "BF16";
        expectedPrecisions["Convolution_3"] = "BF16";
        expectedPrecisions["Elt_max"] = "FP32";
        expectedPrecisions["Elt_mul"] = "FP32";
        expectedPrecisions["Elt_sum"] = "ndef";
    }
};

@ -122,9 +122,9 @@ protected:
        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
        // performance counters
        expectedPrecisions["Matmul_0"] = "BF16";
        expectedPrecisions["Mul_1"] = "FP32";
        expectedPrecisions["Mul_1"] = "BF16";
        expectedPrecisions["Add_1"] = "FP32";
        expectedPrecisions["Relu_1"] = "FP32";
        expectedPrecisions["Relu_1"] = "ndef";
        expectedPrecisions["Conc_1"] = "BF16";
        expectedPrecisions["Matmul_1"] = "BF16";
    }

@ -1,146 +0,0 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "bfloat16_helpers.hpp"

#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <map>
#include <functional>
#include <utility>

#include <ie_core.hpp>
#include <ie_plugin_config.hpp>

#include "common_test_utils/common_utils.hpp"

#include "ngraph/opsets/opset1.hpp"

using namespace std;
using namespace ngraph;
using namespace InferenceEngine;

namespace LayerTestsDefinitions {

class Interpolation : public BasicBF16Test {
protected:
    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
        // Convolution (BF16)
        //     |
        // Interpolation (in the case of mode = "linear") (FP32)
        //     |
        // Convolution (BF16)

        // STAGE1: construction of the GRAPH
        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
        auto channelsCount = inputShapes[1];

        // add
        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{inputShapes});
        input1->set_friendly_name("Input_1");
        std::shared_ptr<ngraph::opset1::Constant> addConst = nullptr;
        if (netPrecision == Precision::FP32) {
            addConst = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
        } else {
            addConst = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(FuncTestUtils::Bf16TestUtils::reducePrecisionBitwiseS(2.0f)) });
        }
        auto addNode = std::make_shared<opset1::Multiply>(input1, addConst);
        addNode->set_friendly_name("Add_1");

        // convolution
        std::shared_ptr<ngraph::opset1::Constant> weightsNode1 = nullptr, weightsNode2 = nullptr;
        ngraph::Shape convFilterShape = { channelsCount, channelsCount, 3, 3 };  // out channels, input channels, kernel h, kernel w
        if (netPrecision == Precision::FP32) {
            std::vector<float> weightValuesFP32;
            weightValuesFP32.resize(channelsCount * channelsCount * 3 * 3);
            FuncTestUtils::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
            weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
        } else {
            std::vector<short> weightValuesBF16;
            weightValuesBF16.resize(channelsCount * channelsCount * 3 * 3);
            FuncTestUtils::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
            weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
        }

        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
            addNode, weightsNode1,
            ngraph::Strides({ 1, 1 }),         // strides
            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
            ngraph::CoordinateDiff({ 1, 1 }),  // pad end
            ngraph::Strides({ 1, 1 }),         // dilation
            ngraph::op::PadType::EXPLICIT);    // pad type
        convNode1->set_friendly_name("Convolution_1");

        // interpolation
        auto heightSize = static_cast<long>(inputShapes[2]);
        auto weigthSize = static_cast<long>(inputShapes[3]);
        std::vector<int64_t> outShape = {2 * heightSize, 2 * weigthSize};

        auto interpolShape = std::make_shared<ngraph::op::v0::Constant>(ngraph::element::i64, ngraph::Shape{2}, outShape);
        ngraph::op::v0::InterpolateAttrs attrs;
        attrs.pads_begin.push_back(0);
        attrs.pads_end.push_back(0);
        attrs.axes = ngraph::AxisSet{2, 3};
        attrs.align_corners = false;
        attrs.mode = "linear";
        attrs.antialias = false;
        auto interpolNode = std::make_shared<opset1::Interpolate>(
            convNode1,
            interpolShape, attrs);
        interpolNode->set_friendly_name("Interp");

        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
            interpolNode, weightsNode2,
            ngraph::Strides({ 1, 1 }),         // strides
            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
            ngraph::CoordinateDiff({ 1, 1 }),  // pad end
            ngraph::Strides({ 1, 1 }),         // dilation
            ngraph::op::PadType::EXPLICIT);    // pad type
        convNode2->set_friendly_name("Convolution_2");
        return std::make_shared<ngraph::Function>(convNode2, ngraph::ParameterVector{input1});
    }
    void SetUp() override {
        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
        fnPtr = createGraph(netPrecision);

        // STAGE2: set up safe threshold <= 5% from maximum value of output tensor
        threshold = 0.02f;  // Max in fp32 network by output: 2.531

        // STAGE3:
        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
        // performance counters
        expectedPrecisions["Convolution_1"] = "BF16";
        expectedPrecisions["Interp"] = "FP32";
        expectedPrecisions["Convolution_2"] = "BF16";
    }
};

TEST_P(Interpolation, CompareWithRefImpl) {
    test();
};

INSTANTIATE_TEST_CASE_P(smoke_FP32_bfloat16_NoReshape, Interpolation,
                        ::testing::Combine(
                            ::testing::Values(Precision::FP32),
                            ::testing::Values(Precision::FP32),
                            ::testing::Values(SizeVector({ 1, 1, 2, 2 })),
                            ::testing::Values(SizeVector()),
                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
                        Interpolation::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_BF16_bfloat16_NoReshape, Interpolation,
                        ::testing::Combine(
                            ::testing::Values(Precision::FP32),
                            ::testing::Values(Precision::BF16),
                            ::testing::Values(SizeVector({ 1, 1, 2, 2 })),
                            ::testing::Values(SizeVector()),
                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
                        Interpolation::getTestCaseName);

} // namespace LayerTestsDefinitions
@ -151,12 +151,9 @@ protected:
        // performance counters
        expectedPrecisions["ADD_1"] = "FP32";
        expectedPrecisions["CONV_1"] = "BF16";
        expectedPrecisions["CONV_2"] = "BF16";
        expectedPrecisions["RELU_2"] = "ndef";
        expectedPrecisions["DW_CONV"] = "BF16";
        expectedPrecisions["RELU_DW"] = "ndef";
        expectedPrecisions["NORM_1"] = "FP32";
        expectedPrecisions["CONC_1"] = "BF16";
    }
};

@ -1,146 +0,0 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "bfloat16_helpers.hpp"

#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <map>
#include <functional>
#include <utility>

#include <ie_core.hpp>
#include <ie_plugin_config.hpp>

#include "common_test_utils/common_utils.hpp"

#include "ngraph/opsets/opset1.hpp"

using namespace std;
using namespace ngraph;
using namespace InferenceEngine;

namespace LayerTestsDefinitions {

class Resample : public BasicBF16Test {
protected:
    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
        // Convolution (BF16)
        //     |
        // Interpolation (Resample in the case of mode = "nearest") (FP32)
        //     |
        // Convolution (BF16)

        // STAGE1: construction of the GRAPH

        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
        // add
        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{inputShapes});
        auto channelsCount = inputShapes[1];
        input1->set_friendly_name("Input_1");
        std::shared_ptr<ngraph::opset1::Constant> addConst = nullptr;
        if (netPrecision == Precision::FP32) {
            addConst = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
        } else {
            addConst = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(FuncTestUtils::Bf16TestUtils::reducePrecisionBitwiseS(2.0f)) });
        }
        auto addNode = std::make_shared<opset1::Multiply>(input1, addConst);
        addNode->set_friendly_name("Add_1");

        // convolution
        std::shared_ptr<ngraph::opset1::Constant> weightsNode1 = nullptr, weightsNode2 = nullptr;
        ngraph::Shape convFilterShape = { channelsCount, channelsCount, 3, 3 };  // out channels, input channels, kernel h, kernel w
        if (netPrecision == Precision::FP32) {
            std::vector<float> weightValuesFP32;
            weightValuesFP32.resize(channelsCount * channelsCount * 3 * 3);
            FuncTestUtils::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
            weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
        } else {
            std::vector<short> weightValuesBF16;
            weightValuesBF16.resize(channelsCount * channelsCount * 3 * 3);
            FuncTestUtils::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
            weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
        }

        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
            addNode, weightsNode1,
            ngraph::Strides({ 1, 1 }),         // strides
            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
            ngraph::CoordinateDiff({ 1, 1 }),  // pad end
            ngraph::Strides({ 1, 1 }),         // dilation
            ngraph::op::PadType::EXPLICIT);    // pad type
        convNode1->set_friendly_name("Convolution_1");

        // interpolation
        auto heightSize = static_cast<long>(inputShapes[2]);
        auto weigthSize = static_cast<long>(inputShapes[3]);
        std::vector<int64_t> outShape = {2 * heightSize, 2 * weigthSize};

        auto interpolShape = std::make_shared<ngraph::op::v0::Constant>(ngraph::element::i64, ngraph::Shape{2}, outShape);
        ngraph::op::v0::InterpolateAttrs attrs;
        attrs.pads_begin.push_back(0);
        attrs.pads_end.push_back(0);
        attrs.axes = ngraph::AxisSet{2, 3};
        attrs.align_corners = false;
        attrs.mode = "nearest";
        attrs.antialias = false;
        auto interpolNode = std::make_shared<opset1::Interpolate>(
            convNode1,
            interpolShape, attrs);
        interpolNode->set_friendly_name("Interp");

        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
            interpolNode, weightsNode2,
            ngraph::Strides({ 1, 1 }),         // strides
            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
            ngraph::CoordinateDiff({ 1, 1 }),  // pad end
            ngraph::Strides({ 1, 1 }),         // dilation
            ngraph::op::PadType::EXPLICIT);    // pad type
        convNode2->set_friendly_name("Convolution_2");
        return std::make_shared<ngraph::Function>(convNode2, ngraph::ParameterVector{input1});
    }
    void SetUp() override {
        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
        fnPtr = createGraph(netPrecision);

        // STAGE2: set up safe threshold <= 5% from maximum value of output tensor
        threshold = 0.02f;  // Max in fp32 network by output: 2.35926

        // STAGE3:
        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
        // performance counters
        expectedPrecisions["Convolution_1"] = "BF16";
        expectedPrecisions["Interp"] = "FP32";
        expectedPrecisions["Convolution_2"] = "BF16";
    }
};

TEST_P(Resample, CompareWithRefImpl) {
    test();
};

INSTANTIATE_TEST_CASE_P(smoke_FP32_bfloat16_NoReshape, Resample,
                        ::testing::Combine(
                            ::testing::Values(Precision::FP32),
                            ::testing::Values(Precision::FP32),
                            ::testing::Values(SizeVector({ 1, 1, 2, 2 })),
                            ::testing::Values(SizeVector()),
                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
                        Resample::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_BF16_bfloat16_NoReshape, Resample,
                        ::testing::Combine(
                            ::testing::Values(Precision::FP32),
                            ::testing::Values(Precision::BF16),
                            ::testing::Values(SizeVector({ 1, 1, 2, 2 })),
                            ::testing::Values(SizeVector()),
                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
                        Resample::getTestCaseName);

} // namespace LayerTestsDefinitions
@ -123,7 +123,6 @@ protected:
        // performance counters
        expectedPrecisions["ADD_1"] = "FP32";
        expectedPrecisions["CONV_1"] = "BF16";
        expectedPrecisions["ADD_2"] = "FP32";
        expectedPrecisions["ELT_1"] = "ndef";
    }
};

@ -93,7 +93,7 @@ protected:
        fnPtr = createGraph(netPrecision);

        // STAGE1:
        threshold = 7e-2;
        threshold = 9e-2;
        // STAGE2:
        // filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
        // performance counters

@ -117,8 +117,6 @@ protected:
        expectedPrecisions["ADD_1"] = "FP32";
        expectedPrecisions["CONV_1"] = "BF16";
        expectedPrecisions["CONV_2"] = "BF16";
        expectedPrecisions["CONC_1"] = "BF16";
        expectedPrecisions["RELU_1"] = "FP32";
    }
};

@ -131,8 +131,7 @@ protected:
        expectedPrecisions["Add_1"] = "FP32";
        expectedPrecisions["Add_2"] = "FP32";
        expectedPrecisions["Convolution_1"] = "BF16";
        expectedPrecisions["Convolution_2"] = "BF16";
        expectedPrecisions["ELT_1"] = "FP32";
        expectedPrecisions["ELT_1"] = "ndef";
    }
};

@ -152,7 +152,6 @@ protected:
        expectedPrecisions["Add_2"] = "FP32";
        expectedPrecisions["ELT_1"] = "ndef";
        expectedPrecisions["RELU_1"] = "ndef";
        expectedPrecisions["Add_3"] = "FP32";
    }
};

@ -114,7 +114,6 @@ protected:
        // performance counters
        expectedPrecisions["Add_4"] = "FP32";
        expectedPrecisions["Convolution_6"] = "BF16";
        expectedPrecisions["AvgPool_8"] = "FP32";
    }
};

@ -61,7 +61,7 @@ std::vector<std::string> disabledTestPatterns() {
        R"(.*decomposition1_batch=5_hidden_size=10_input_size=30_.*tanh.relu.*_clip=0_linear_before_reset=1.*_targetDevice=CPU_.*)",
    };

    if (!InferenceEngine::with_cpu_x86_bfloat16()) {
    if (!InferenceEngine::with_cpu_x86_avx512_core()) {
        // on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
        // tests are useless on such platforms
        retVector.emplace_back(R"(.*BF16.*)");
@ -89,8 +89,6 @@ const std::vector<ngraph::op::EpsMode> epsMode = {
    ngraph::op::EpsMode::MAX,
};

std::vector<Precision> inpOutPrc = {Precision::BF16};

std::vector<CPUSpecificParams> cpuParams_4D = {
    CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
    CPUSpecificParams({nhwc}, {nhwc}, {}, {}),