[BF16] BF16 simulator for AVX512 was done (#3424)

Alexey Varyzgin 2020-12-10 16:25:01 +03:00 committed by GitHub
parent 2cfc8ade62
commit 3bcac1641d
36 changed files with 279 additions and 435 deletions

View File

@ -33,6 +33,9 @@ Config::Config() {
streamExecutorConfig._threadBindingType = InferenceEngine::IStreamsExecutor::CORES;
#endif
if (!with_cpu_x86_bfloat16())
enforceBF16 = false;
updateProperties();
}
@ -93,7 +96,7 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
dumpQuantizedGraphToIr = val;
} else if (key == PluginConfigParams::KEY_ENFORCE_BF16) {
if (val == PluginConfigParams::YES) {
if (with_cpu_x86_bfloat16())
if (with_cpu_x86_avx512_core())
enforceBF16 = true;
else
THROW_IE_EXCEPTION << "Platform doesn't support BF16 format";
@ -143,8 +146,6 @@ void Config::updateProperties() {
_config.insert({ PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS, std::to_string(streamExecutorConfig._streams) });
_config.insert({ PluginConfigParams::KEY_CPU_THREADS_NUM, std::to_string(streamExecutorConfig._threads) });
_config.insert({ PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT, dumpToDot });
if (!with_cpu_x86_bfloat16())
enforceBF16 = false;
if (enforceBF16)
_config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES });
else

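Reviewer note: readProperties() now accepts KEY_ENFORCE_BF16 on any AVX-512 (avx512_core) machine instead of only on CPUs with native BF16 support. For reference, a minimal sketch of how an application requests BF16 enforcement through this key (hypothetical model path, error handling omitted):

```cpp
#include <ie_core.hpp>
#include <ie_plugin_config.hpp>
#include <map>
#include <string>

int main() {
    InferenceEngine::Core core;
    auto network = core.ReadNetwork("model.xml");  // placeholder path

    // On AVX-512 machines without native BF16 this now succeeds and runs
    // through the vcvtneps2bf16 emulation added by this commit.
    std::map<std::string, std::string> config = {
        { InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
          InferenceEngine::PluginConfigParams::YES }
    };
    auto execNetwork = core.LoadNetwork(network, "CPU", config);
    return 0;
}
```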
View File

@ -63,7 +63,7 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
i++;
}
if (with_cpu_x86_bfloat16() && isFloatModel) {
if (with_cpu_x86_avx512_core() && isFloatModel) {
BF16Transformer bf16Transformer;
CNNNetwork cnnetwork(_clonedNetwork);
// If the enforceBF16 flag was set, the BF16 transformation is applied to all layers supported by the CPU plugin.

View File

@ -13,7 +13,7 @@ namespace MKLDNNPlugin {
class jit_emitter {
public:
jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
: h(host), host_isa_(host_isa), n(node), exec_prc_(exec_prc) {
k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well
@ -32,7 +32,7 @@ protected:
size_t get_max_vecs_count() const;
size_t get_vec_length() const;
const MKLDNNNode& n;
const MKLDNNNode* n;
mkldnn::impl::cpu::jit_generator* h;
mkldnn::impl::cpu::cpu_isa_t host_isa_;
InferenceEngine::Precision exec_prc_;

View File

@ -48,8 +48,12 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]);
@ -72,16 +76,16 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
if (isa == sse42) {
if (isa == cpu::sse42) {
uni_vmovups(vmm_mask, vmm_val);
uni_vcmpgtps(vmm_mask, vmm_mask, vmm_max);
} else if (isa == avx2) {
} else if (isa == cpu::avx2) {
uni_vcmpgtps(vmm_mask, vmm_val, vmm_max);
} else {
vcmpps(k_mask, vmm_val, vmm_max, _cmp_nle_us);
}
if (isa == avx512_common) {
if (isa == cpu::avx512_common) {
vptestmd(k_mask, vmm_mask, vmm_mask);
vblendmps(vmm_max | k_mask, vmm_max, vmm_val);
} else {
@ -143,13 +147,17 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
exp_injector->prepare_table();
ker_ = (decltype(ker_))this->getCode();
}
private:
using Vmm = typename conditional3<isa == sse42, Xbyak::Xmm, isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Reg64 reg_src = r8;
@ -169,6 +177,8 @@ private:
const Xbyak::Opmask k_mask = Xbyak::Opmask(1);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_dt) {
@ -192,8 +202,11 @@ private:
uni_vmovups(op, vmm_dst);
break;
case Precision::BF16:
vcvtneps2bf16(ymm_dst, vmm_dst);
uni_vmovups(op, ymm_dst);
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
default:
assert(!"unknown dst_dt");
@ -204,7 +217,7 @@ private:
SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
: input_prec(inpPrc), output_prec(outPrc) {
if (Precision::BF16 == output_prec) {
if (!mayiuse(avx512_core_bf16)) {
if (!mayiuse(avx512_core)) {
THROW_IE_EXCEPTION << "SoftmaxGeneric doesn't support BF16 precision on this target.";
}
}
@ -214,14 +227,14 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
jcp.src_dt = inpPrc;
jcp.dst_dt = outPrc;
if (mayiuse(avx512_common)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx512_common>(jcp));
if (mayiuse(cpu::avx512_common)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx512_common>(jcp));
block_size = 16;
} else if (mayiuse(avx2)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<avx2>(jcp));
} else if (mayiuse(cpu::avx2)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx2>(jcp));
block_size = 8;
} else if (mayiuse(sse42)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<sse42>(jcp));
} else if (mayiuse(cpu::sse42)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::sse42>(jcp));
block_size = 4;
}
}

View File

@ -16,7 +16,7 @@ using namespace Xbyak;
namespace MKLDNNPlugin {
/// ADD ///
jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_add_emitter::get_inputs_num() { return 2; }
@ -50,7 +50,7 @@ void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std
}
/// MUL_ADD ///
jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_mul_add_emitter::get_inputs_num() { return 3; }
@ -109,7 +109,7 @@ size_t jit_mul_add_emitter::aux_vecs_count() const {
}
/// SUB ///
jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_subtract_emitter::get_inputs_num() { return 2; }
@ -144,7 +144,7 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, cons
/// MULTIPLY ///
jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_multiply_emitter::get_inputs_num() { return 2; }
@ -179,7 +179,7 @@ void jit_multiply_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, cons
/// DIVIDE ///
jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_divide_emitter::get_inputs_num() { return 2; }
@ -214,7 +214,7 @@ void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
/// FLOOR_MOD ///
jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_floor_mod_emitter::get_inputs_num() { return 2; }
@ -263,7 +263,7 @@ size_t jit_floor_mod_emitter::aux_vecs_count() const {
}
/// MOD ///
jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_mod_emitter::get_inputs_num() { return 2; }
@ -312,7 +312,7 @@ size_t jit_mod_emitter::aux_vecs_count() const {
}
/// MAXIMUM ///
jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_maximum_emitter::get_inputs_num() { return 2; }
@ -359,7 +359,7 @@ std::set<InferenceEngine::Precision> jit_maximum_emitter::get_supported_precisio
}
/// MINIMUM ///
jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_minimum_emitter::get_inputs_num() { return 2; }
@ -406,7 +406,7 @@ std::set<InferenceEngine::Precision> jit_minimum_emitter::get_supported_precisio
}
/// SQUARED_DIFFERENCE ///
jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_squared_difference_emitter::get_inputs_num() { return 2; }
@ -444,7 +444,7 @@ void jit_squared_difference_emitter::emit_isa(const std::vector<size_t> &in_vec_
/// POWER_DYNAMIC ///
jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {}
size_t jit_power_dynamic_emitter::get_inputs_num() { return 2; }
@ -550,7 +550,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
/// EQUAL ///
jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -606,7 +606,7 @@ size_t jit_equal_emitter::aux_vecs_count() const {
}
/// NOT_EQUAL ///
jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -662,7 +662,7 @@ size_t jit_not_equal_emitter::aux_vecs_count() const {
}
/// GREATER ///
jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -718,7 +718,7 @@ size_t jit_greater_emitter::aux_vecs_count() const {
}
/// GREATER_EQUAL ///
jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -774,7 +774,7 @@ size_t jit_greater_equal_emitter::aux_vecs_count() const {
}
/// LESS ///
jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -830,7 +830,7 @@ size_t jit_less_emitter::aux_vecs_count() const {
}
/// LESS_EQUAL ///
jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -887,7 +887,7 @@ size_t jit_less_equal_emitter::aux_vecs_count() const {
}
/// LOGICAL_AND ///
jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -964,7 +964,7 @@ size_t jit_logical_and_emitter::aux_vecs_count() const {
/// LOGICAL_OR ///
jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -1040,7 +1040,7 @@ size_t jit_logical_or_emitter::aux_vecs_count() const {
}
/// LOGICAL_XOR ///
jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -1116,7 +1116,7 @@ size_t jit_logical_xor_emitter::aux_vecs_count() const {
}
/// LOGICAL_NOT ///
jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -1171,7 +1171,7 @@ size_t jit_logical_not_emitter::aux_vecs_count() const {
}
/// POWER_STATIC ///
jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}
@ -1198,7 +1198,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n->getCnnLayer().get());
if (powerLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert power layer.";
@ -1340,7 +1340,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
}
void jit_power_static_emitter::register_table_entries() {
auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n.getCnnLayer().get());
auto *powerLayer = dynamic_cast<InferenceEngine::PowerLayer *>(n->getCnnLayer().get());
if (powerLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert power layer.";
@ -1359,7 +1359,7 @@ size_t jit_power_static_emitter::aux_vecs_count() const {
}
/// PRELU ///
jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, Precision exec_prc)
jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
}

View File

@ -12,7 +12,7 @@ namespace MKLDNNPlugin {
class jit_add_emitter : public jit_emitter {
public:
jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -27,7 +27,7 @@ private:
class jit_mul_add_emitter : public jit_emitter {
public:
jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -45,7 +45,7 @@ private:
class jit_subtract_emitter : public jit_emitter {
public:
jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -61,7 +61,7 @@ private:
class jit_multiply_emitter : public jit_emitter {
public:
jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -77,7 +77,7 @@ private:
class jit_divide_emitter : public jit_emitter {
public:
jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -93,7 +93,7 @@ private:
class jit_floor_mod_emitter : public jit_emitter {
public:
jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -110,7 +110,7 @@ private:
class jit_mod_emitter : public jit_emitter {
public:
jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -127,7 +127,7 @@ private:
class jit_maximum_emitter : public jit_emitter {
public:
jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -144,7 +144,7 @@ private:
class jit_minimum_emitter : public jit_emitter {
public:
jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -161,7 +161,7 @@ private:
class jit_squared_difference_emitter : public jit_emitter {
public:
jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -177,7 +177,7 @@ private:
class jit_power_dynamic_emitter : public jit_emitter {
public:
jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -193,7 +193,7 @@ private:
class jit_equal_emitter : public jit_emitter {
public:
jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -212,7 +212,7 @@ private:
class jit_not_equal_emitter : public jit_emitter {
public:
jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -231,7 +231,7 @@ private:
class jit_greater_emitter : public jit_emitter {
public:
jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -250,7 +250,7 @@ private:
class jit_greater_equal_emitter : public jit_emitter {
public:
jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -269,7 +269,7 @@ private:
class jit_less_emitter : public jit_emitter {
public:
jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -288,7 +288,7 @@ private:
class jit_less_equal_emitter : public jit_emitter {
public:
jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -307,7 +307,7 @@ private:
class jit_logical_and_emitter : public jit_emitter {
public:
jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -326,7 +326,7 @@ private:
class jit_logical_or_emitter : public jit_emitter {
public:
jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -345,7 +345,7 @@ private:
class jit_logical_xor_emitter : public jit_emitter {
public:
jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -363,7 +363,7 @@ private:
class jit_logical_not_emitter : public jit_emitter {
public:
jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -381,7 +381,7 @@ private:
class jit_power_static_emitter : public jit_emitter {
public:
jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -399,7 +399,7 @@ private:
class jit_prelu_emitter : public jit_emitter {
public:
jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;

View File

@ -13,9 +13,9 @@ using namespace Xbyak;
namespace MKLDNNPlugin {
jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode& node, InferenceEngine::Precision exec_prc)
jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc)
: jit_emitter(host, host_isa, node, exec_prc) {
auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(n);
auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(*n);
auto alg = static_cast<mkldnn_alg_kind_t>(eltwiseNode.getAlgorithm());

View File

@ -13,7 +13,7 @@ namespace MKLDNNPlugin {
class jit_mkldnn_emitter : public jit_emitter {
public:
jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode& node,
jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;

View File

@ -11,6 +11,7 @@
#include <cmath>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include "utils/bfloat16.hpp"
#include "ie_parallel.hpp"
#include "mkldnn_quantize_node.h"
#include <map>
@ -45,7 +46,7 @@ struct EltwiseEmitterContext {
std::shared_ptr<jit_emitter> emitter;
mkldnn::impl::cpu::jit_generator *host;
mkldnn::impl::cpu::cpu_isa_t host_isa;
const MKLDNNNode & node;
const MKLDNNNode * node;
InferenceEngine::Precision exec_prc;
};
@ -108,6 +109,9 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
}
}
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
for (int i = 0; i < jep.inputs_number; i++)
@ -273,6 +277,9 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
eltwise_emitter->emit_table();
for (int i = 0; i < post_op_emitters.size(); i++) {
post_op_emitters[i]->emit_table();
@ -320,6 +327,8 @@ private:
Vmm vmm_d_bias = Vmm(13);
Vmm vmm_zero = Vmm(15);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
std::shared_ptr<jit_emitter> eltwise_emitter = nullptr;
std::vector<std::shared_ptr<jit_emitter>> post_op_emitters = {};
@ -392,12 +401,13 @@ private:
std::shared_ptr<jit_emitter> create_eltwise_emitter(MKLDNNNode& node, Precision exec_prec) {
auto& eltwiseNode = dynamic_cast<const MKLDNNEltwiseNode&>(node);
const MKLDNNNode * eltwiseNodePtr = dynamic_cast<const MKLDNNNode*>(&node);
EltwiseEmitterContext ctx = {
nullptr,
this,
isa,
eltwiseNode,
eltwiseNodePtr,
exec_prec
};
@ -615,8 +625,11 @@ private:
uni_vmovups(op, vmm_dst);
break;
case Precision::BF16:
vcvtneps2bf16(ymm_dst, vmm_dst);
uni_vmovups(op, ymm_dst);
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case Precision::I16:
if (isa == avx512_common) {
@ -1024,7 +1037,7 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
}
}
if (!mayiuse(avx512_core_bf16)) {
if (!mayiuse(avx512_core)) {
bool hasBF16 = false;
for (auto &inPrc : inputPrecisions)
if (inPrc == Precision::BF16)

View File

@ -21,7 +21,7 @@
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include "common/cpu_memcpy.h"
#include "ngraph/type/bfloat16.hpp"
#include "utils/bfloat16.hpp"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -59,6 +59,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
}
}
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
if (attr_.post_ops_.len_ != 0)
@ -134,6 +137,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
for (auto& inj : eltwise_injectors)
inj->prepare_table();
if ((jcp_.mode == InterpolateMode::cubic) && (jcp_.layout == InterpolateLayoutType::planar)) {
@ -224,6 +230,8 @@ private:
Xbyak::Label l_table_constant;
Opmask k_mask = Xbyak::Opmask(1);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
std::vector<std::shared_ptr<jit_uni_depthwise_injector_f32<isa>>> depthwise_injectors;
std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors;
@ -1278,12 +1286,11 @@ private:
movd(op, xmm_dst);
}
} else if (dst_dt == memory::bf16) {
if (mayiuse(avx512_core_bf16)) {
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
uni_vmovups(op, ymm_dst);
} else {
assert(!"data type of bf16 is only supported for ISA:avx512_core_bf16");
}
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
}
}
@ -1584,7 +1591,7 @@ void MKLDNNInterpolateNode::initSupportedPrimitiveDescriptors() {
if ((inputPrecision != Precision::I8) && (inputPrecision != Precision::U8) && (inputPrecision != Precision::BF16)) {
inputPrecision = Precision::FP32;
}
if ((inputPrecision == Precision::BF16) && !mayiuse(avx512_core_bf16)) {
if ((inputPrecision == Precision::BF16) && !mayiuse(avx512_core)) {
inputPrecision = Precision::FP32;
}
Precision outputPrecision = inputPrecision;
@ -2714,7 +2721,7 @@ float MKLDNNInterpolateNode::getValue(const uint8_t *base, size_t offset, Infere
}
case Precision::BF16: {
const uint16_t *valuePtr = reinterpret_cast<const uint16_t *>(baseOffset);
return ngraph::bfloat16::from_bits(*valuePtr);
return bfloat16_t::from_bits(*valuePtr);
break;
}
case Precision::FP32: {
@ -2743,7 +2750,7 @@ void MKLDNNInterpolateNode::setValue(uint8_t *base, size_t offset, float value,
break;
}
case Precision::BF16: {
uint16_t data = ngraph::bfloat16(value).to_bits();
uint16_t data = bfloat16_t(value).to_bits();
std::memcpy(baseOffset, &data, 2);
break;
}

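Reviewer note: the getValue/setValue hunks above switch from ngraph::bfloat16 to the plugin's own bfloat16_t from utils/bfloat16.hpp. A small usage sketch of that helper, based on the from_bits/to_bits calls visible in this diff (the implicit float conversion is implied by the class comment in that header):

```cpp
using MKLDNNPlugin::bfloat16_t;

uint16_t raw = 0x4080;                        // BF16 bit pattern for 4.0f
float as_float = bfloat16_t::from_bits(raw);  // unpack: 4.0f
uint16_t back  = bfloat16_t(1.5f).to_bits();  // pack a float into 16 bits (0x3fc0)
```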
View File

@ -240,6 +240,9 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
}
}
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
@ -311,6 +314,9 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
for (auto& inj : eltwise_injectors)
inj->prepare_table();
@ -344,6 +350,8 @@ private:
Vmm vmm_d_weights = Vmm(5);
Vmm vmm_d_bias = Vmm(6);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
Xbyak::Label l_table;
std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
@ -381,8 +389,11 @@ private:
if (dst_dt == memory::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::bf16) {
vcvtneps2bf16(ymm_dst, vmm_dst);
uni_vmovups(op, ymm_dst);
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
} else if (dst_dt == memory::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
@ -504,7 +515,7 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
}
}
if (!mayiuse(avx512_core_bf16)) {
if (!mayiuse(avx512_core)) {
if (outputPrecision == Precision::BF16)
outputPrecision = Precision::FP32;
}

View File

@ -165,6 +165,9 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
}
}
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
@ -188,6 +191,8 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
for (auto& inj : eltwise_injectors)
inj->prepare_table();
@ -230,6 +235,8 @@ private:
Vmm vmm_d_bias = Vmm(6);
Vmm vmm_zero = Vmm(7);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
std::vector<std::shared_ptr<jit_uni_eltwise_injector_f32<isa>>> eltwise_injectors;
std::vector<std::shared_ptr<jit_uni_depthwise_injector_f32<isa>>> depthwise_injectors;
std::vector<std::shared_ptr<jit_uni_quantization_injector_f32<isa>>> quantization_injectors;
@ -580,7 +587,10 @@ private:
if (dst_dt == memory::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::bf16) {
vcvtneps2bf16(ymm_dst, vmm_dst);
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
} else if (dst_dt == memory::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
@ -752,7 +762,7 @@ void MKLDNNNormalizeNode::getSupportedDescriptors() {
weights_blob->allocate();
float* src = layer->blobs.at("weights")->buffer();
float* dst = weights_blob->wmap();
memcpy(dst, src, layer->blobs.at("weights")->byteSize());
cpu_memcpy(dst, src, layer->blobs.at("weights")->byteSize());
} else if (weights_prec == Precision::BF16) {
MKLDNNPlugin::BF16Transformer transformer;
weights_blob = transformer.convertBF16ToFloat(tweights);
@ -780,7 +790,7 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
}
if (inputPrecision == Precision::BF16 || outputPrecision == Precision::BF16) {
if (!mayiuse(avx512_core_bf16))
if (!mayiuse(avx512_core))
inputPrecision = outputPrecision = Precision::FP32;
else
inputPrecision = outputPrecision = Precision::BF16;

View File

@ -78,6 +78,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
: jit_uni_reduce_kernel(jcp), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
@ -103,6 +106,9 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
if (jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::L1 || jcp_.reduce_mode == Reduce::Max ||
jcp_.reduce_mode == Reduce::Min || jcp_.reduce_mode == Reduce::Prod || jcp_.reduce_mode == Reduce::Or) {
prepare_aux_table();
@ -146,6 +152,8 @@ private:
const Xbyak::Opmask k_mask = Xbyak::Opmask(1);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
Xbyak::Label l_table;
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
@ -605,8 +613,11 @@ private:
uni_vmovups(op, vmm_dst);
break;
case memory::bf16:
vcvtneps2bf16(ymm_dst, vmm_dst);
uni_vmovups(op, ymm_dst);
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::s8:
if (isa == avx512_common) {
@ -806,6 +817,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
: jit_uni_reduce_post_kernel(jcp), jit_generator() {
log_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_log, 0.f, 0.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
mov(reg_dst, ptr[reg_params + GET_OFF(dst)]);
@ -823,6 +837,9 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) {
log_injector->prepare_table();
}
@ -855,6 +872,8 @@ private:
Xbyak::Xmm xmm_aux2 = Xbyak::Xmm(5);
Xbyak::Xmm xmm_aux3 = Xbyak::Xmm(6);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> log_injector;
inline void reduce_post_main() {
@ -1063,8 +1082,11 @@ private:
uni_vmovups(op, vmm_dst);
break;
case memory::bf16:
vcvtneps2bf16(ymm_dst, vmm_dst);
uni_vmovups(op, ymm_dst);
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::s8:
if (isa == avx512_common) {
@ -1355,7 +1377,7 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
// Since in jit mode we use the output memory as an intermediate accumulator for certain reduce modes, we can't use BF16 output precision due to
// the possible accuracy loss. Therefore, for such modes, we will change the output precision to FP32.
if (Precision::BF16 == outputPrecision) {
if (!mayiuse(avx512_core_bf16)) {
if (!mayiuse(avx512_core)) {
outputPrecision = Precision::FP32;
} else if (reduceMode != Reduce::And && reduceMode != Reduce::Or &&
reduceMode != Reduce::Max && reduceMode != Reduce::Min) {

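Reviewer note: a quick numeric illustration of the accuracy loss the comment at the top of this hunk refers to. BF16 keeps only 8 significant bits, so once a running sum reaches 256, further increments of 1 are rounded away. A minimal, self-contained sketch (plain C++, not plugin code; NaN handling omitted):

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>

// Round-to-nearest-even float -> bf16 -> float, the same rounding the
// emulator in this commit implements.
static float to_bf16_and_back(float f) {
    uint32_t x;
    std::memcpy(&x, &f, sizeof(x));
    x += 0x7fff + ((x >> 16) & 1);   // rounding bias
    x &= 0xffff0000u;                // keep the upper 16 bits only
    std::memcpy(&f, &x, sizeof(f));
    return f;
}

int main() {
    float fp32_acc = 0.f, bf16_acc = 0.f;
    for (int i = 0; i < 512; ++i) {
        fp32_acc += 1.f;
        bf16_acc = to_bf16_and_back(bf16_acc + 1.f);  // accumulate in BF16
    }
    std::cout << fp32_acc << " vs " << bf16_acc << "\n";  // prints: 512 vs 256
}
```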
View File

@ -16,8 +16,9 @@
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
using namespace MKLDNNPlugin;
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::utils;
@ -56,6 +57,9 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jit_uni_logistic_kernel(), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
@ -103,6 +107,9 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
this->postamble();
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16->emit_table();
exp_injector->prepare_table();
prepare_table();
@ -111,7 +118,7 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
}
private:
using Vmm = typename conditional3<isa == sse42, Xbyak::Xmm, isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Address table_val(int index) { return ptr[reg_table + index * vlen]; }
@ -130,6 +137,8 @@ private:
const Xbyak::Opmask k_mask = Xbyak::Opmask(1);
std::unique_ptr<jit_emu_vcvtneps2bf16> emu_vcvtneps2bf16;
Xbyak::Label l_table;
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
@ -148,10 +157,10 @@ private:
uni_vmovups(vmm_aux2, table_val(1));
uni_vsubps(vmm_aux2, vmm_aux2, vmm_src);
if (isa == sse42) {
if (isa == cpu::sse42) {
uni_vblendvps(vmm_aux2, vmm_aux2, vmm_src, vmm_aux0);
uni_vmovups(vmm_src, vmm_aux2);
} else if (isa == avx2) {
} else if (isa == cpu::avx2) {
uni_vblendvps(vmm_src, vmm_aux2, vmm_src, vmm_aux0);
} else {
vptestmd(k_mask, vmm_aux0, vmm_aux0);
@ -199,8 +208,11 @@ private:
uni_vmovups(op, vmm_dst);
break;
case InferenceEngine::Precision::BF16:
vcvtneps2bf16(ymm_dst, vmm_dst);
uni_vmovups(op, ymm_dst);
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
default:
assert(!"unknown dst_dt");
@ -253,7 +265,7 @@ public:
}
if (Precision::BF16 == output_prec) {
if (!mayiuse(avx512_core_bf16)) {
if (!mayiuse(avx512_core)) {
output_prec = Precision::FP32;
}
}
@ -269,14 +281,14 @@ public:
jcp.src_data_size = jcp.dst_data_size = output_prec.size();
block_size = 1;
if (mayiuse(avx512_common)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx512_common>(jcp));
if (mayiuse(cpu::avx512_common)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx512_common>(jcp));
block_size = 16;
} else if (mayiuse(avx2)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<avx2>(jcp));
} else if (mayiuse(cpu::avx2)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx2>(jcp));
block_size = 8;
} else if (mayiuse(sse42)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<sse42>(jcp));
} else if (mayiuse(cpu::sse42)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::sse42>(jcp));
block_size = 4;
}
@ -383,7 +395,7 @@ private:
inline void calculate_logistic(size_t start_index, int count, uint8_t * dst_data) {
auto dst_data_size = output_prec.size();
if (logistic_kernel) {
int blocks_num = div_up(count, block_size);
int blocks_num = MKLDNNPlugin::div_up(count, block_size);
parallel_for(blocks_num, [&](int ib) {
int idx = ib * block_size;
int work_amount = std::min(count - idx, block_size);

View File

@ -6,13 +6,15 @@
#include <cmath>
#include <limits>
#include "utils.hpp"
#include "nodes/common/emitter.h"
/**
* The bfloat16_t class can be used as an arithmetic type. All arithmetic operations go through conversion to the float data type.
*/
#define BFLOAT16_ROUND_MODE_TRUNCATE
#define BFLOAT16_ROUND_MODE_TO_NEAREST_EVEN
namespace MKLDNNPlugin {
class bfloat16_t {
@ -71,6 +73,69 @@ private:
};
uint16_t m_value;
};
class jit_emu_vcvtneps2bf16 : public jit_emitter {
public:
jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
};
size_t get_inputs_num() { return 1; };
private:
void emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs,
const std::vector<size_t>& pool_vec_idxs, const std::vector<size_t>& pool_gpr_idxs) {
if (host_isa_ == mkldnn::impl::cpu::cpu_isa_t::avx512_common) {
Xbyak::Zmm in = Xbyak::Zmm(in_vec_idxs[0]);
Xbyak::Ymm out = Xbyak::Ymm(out_vec_idxs[0]);
Xbyak::Zmm aux = Xbyak::Zmm(aux_vec_idxs[0]);
Xbyak::Zmm aux1 = Xbyak::Zmm(aux_vec_idxs[1]);
h->uni_vpsrld(aux, in, 16);
h->vpandd(aux, aux, table_val("one"));
h->uni_vmovups(aux1, table_val("even"));
h->uni_vpaddd(aux, aux1, aux);
h->uni_vpaddd(aux, in, aux);
h->vfixupimmps(aux, in, table_val("selector"), 0);
h->vpsrad(aux, aux, 16);
h->vpmovdw(out, aux);
} else {
assert(!"unsupported isa");
}
};
inline int encode_fixup_selector(int input, int output) {
return ((output) << (4 * (input)));
}
void register_table_entries() {
enum {
fixup_input_code_qnan_ = 0,
fixup_input_code_snan_ = 1,
fixup_input_code_ninf_ = 4,
fixup_input_code_pinf_ = 5,
fixup_output_code_copy_input_ = 1,
fixup_output_code_qnan_input_ = 2,
};
const int selector_int32 =
/* snan input to qnan output (preserving input bits 0..21) */
encode_fixup_selector(fixup_input_code_snan_, fixup_output_code_qnan_input_) |
/* qnan input to qnan output (preserving input bits 0..21) */
encode_fixup_selector(fixup_input_code_qnan_, fixup_output_code_qnan_input_) |
/* neg inf input copied to output */
encode_fixup_selector(fixup_input_code_ninf_, fixup_output_code_copy_input_) |
/* pos inf input copied to output */
encode_fixup_selector(fixup_input_code_pinf_, fixup_output_code_copy_input_);
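/* with the enum values above, selector_int32 evaluates to 0x00110022 */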
push_arg_entry_of("one", 0x00000001, true);
push_arg_entry_of("even", 0x00007fff, true);
push_arg_entry_of("selector", selector_int32, true);
}
size_t aux_vecs_count() const { return 2; }
};
} // namespace MKLDNNPlugin
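Reviewer note: to make the instruction sequence above easier to follow, here is a scalar C++ sketch of the same round-to-nearest-even conversion. This is my reading of the JIT code; the vfixupimmps step, which keeps NaN/Inf inputs well-formed, is reduced to an explicit NaN check here:

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar equivalent of jit_emu_vcvtneps2bf16: fp32 -> bf16 bits,
// rounding to nearest even, as the shift/and/add sequence above does.
uint16_t emu_vcvtneps2bf16_scalar(float f) {
    uint32_t in;
    std::memcpy(&in, &f, sizeof(in));
    if (std::isnan(f))
        return static_cast<uint16_t>((in >> 16) | 0x0040);  // force a quiet NaN
    uint32_t lsb  = (in >> 16) & 1;                    // uni_vpsrld + vpandd with "one"
    uint32_t bias = 0x7fff + lsb;                      // "even" table value + lsb
    return static_cast<uint16_t>((in + bias) >> 16);   // vpsrad + vpmovdw
}
```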
/**
@ -139,3 +204,4 @@ public:
static constexpr float_round_style round_style = round_to_nearest;
};
} // namespace std

View File

@ -185,12 +185,12 @@ protected:
expectedPrecisions["ReLU1"] = "ndef";
expectedPrecisions["Convolution2"] = "BF16";
expectedPrecisions["Convolution3"] = "BF16";
expectedPrecisions["ReLU2"] = "FP32";
expectedPrecisions["Norm1"] = "FP32";
expectedPrecisions["ReLU2"] = "BF16";
expectedPrecisions["Norm1"] = "BF16";
expectedPrecisions["Eltwise1"] = "ndef";
expectedPrecisions["ReLU3"] = "ndef";
expectedPrecisions["maxPooling1"] = "BF16";
expectedPrecisions["Eltwise2"] = "FP32";
expectedPrecisions["Eltwise2"] = "BF16";
}
};

View File

@ -179,9 +179,9 @@ public:
}
void test() {
if (!InferenceEngine::with_cpu_x86_bfloat16()) {
// on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
// tests are useless on such platforms
if (!InferenceEngine::with_cpu_x86_avx512_core()) {
// We enable bf16 tests on platforms with native bfloat16 support and on platforms with the AVX512 ISA.
// On platforms with the AVX512 ISA but without native bfloat16 support, computations go through the simulation mode.
GTEST_SKIP();
}
std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();

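Reviewer note: the skip condition above now checks only for AVX-512 rather than native BF16. A small probe sketch, assuming the usual InferenceEngine system-configuration header (header name may differ across releases):

```cpp
#include <ie_system_conf.h>
#include <iostream>

int main() {
    // AVX-512 present: BF16 tests run, via simulation if the next flag is false.
    std::cout << "avx512_core support: " << InferenceEngine::with_cpu_x86_avx512_core() << "\n";
    // Native AVX512_BF16 (vcvtneps2bf16 etc.) present: no simulation needed.
    std::cout << "native bf16 support: " << InferenceEngine::with_cpu_x86_bfloat16() << "\n";
    return 0;
}
```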
View File

@ -131,8 +131,6 @@ protected:
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["CONV_2"] = "BF16";
expectedPrecisions["CONC_1_TEST"] = "BF16";
expectedPrecisions["RELU_1"] = "FP32";
}
};

View File

@ -111,9 +111,7 @@ protected:
// STAGE3:
// filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
// performance counters
expectedPrecisions["Convolution_0"] = "BF16";
expectedPrecisions["Convolution_1"] = "BF16";
expectedPrecisions["Elt_sum"] = "FP32";
expectedPrecisions["Elt_sum"] = "BF16";
}
};

View File

@ -100,7 +100,7 @@ protected:
// performance counters
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["CONV_2"] = "BF16";
expectedPrecisions["CONV_2"] = "FP32";
}
};

View File

@ -118,7 +118,6 @@ protected:
// performance counters
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["CONV_2"] = "BF16";
expectedPrecisions["RELU"] = "ndef";
}
};

View File

@ -32,7 +32,7 @@ public:
std::shared_ptr<Function> fnPtr;
SizeVector inputShapes;
std::map<string, string> expectedPrecisions;
float threshold = 3e-2;
float threshold = 7e-2;
Precision netPrecision;
size_t kernel;
CoordinateDiff pads;

View File

@ -122,7 +122,6 @@ protected:
// performance counters
expectedPrecisions["Convolution_0"] = "BF16";
expectedPrecisions["Convolution_1"] = "BF16";
expectedPrecisions["Elt_max"] = "FP32";
}
};

View File

@ -179,8 +179,6 @@ protected:
expectedPrecisions["Convolution_1"] = "BF16";
expectedPrecisions["Convolution_2"] = "BF16";
expectedPrecisions["Convolution_3"] = "BF16";
expectedPrecisions["Elt_max"] = "FP32";
expectedPrecisions["Elt_mul"] = "FP32";
expectedPrecisions["Elt_sum"] = "ndef";
}
};

View File

@ -122,9 +122,9 @@ protected:
// filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
// performance counters
expectedPrecisions["Matmul_0"] = "BF16";
expectedPrecisions["Mul_1"] = "FP32";
expectedPrecisions["Mul_1"] = "BF16";
expectedPrecisions["Add_1"] = "FP32";
expectedPrecisions["Relu_1"] = "FP32";
expectedPrecisions["Relu_1"] = "ndef";
expectedPrecisions["Conc_1"] = "BF16";
expectedPrecisions["Matmul_1"] = "BF16";
}

View File

@ -1,146 +0,0 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "bfloat16_helpers.hpp"
#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <map>
#include <functional>
#include <utility>
#include <ie_core.hpp>
#include <ie_plugin_config.hpp>
#include "common_test_utils/common_utils.hpp"
#include "ngraph/opsets/opset1.hpp"
using namespace std;
using namespace ngraph;
using namespace InferenceEngine;
namespace LayerTestsDefinitions {
class Interpolation : public BasicBF16Test {
protected:
std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
// Convolution (BF16)
// |
// Interpolation (In the case of mode = "linear") (FP32)
// |
// Convolution (BF16)
// STAGE1: construction of the GRAPH
ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
auto channelsCount = inputShapes[1];
// add
auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{inputShapes});
input1->set_friendly_name("Input_1");
std::shared_ptr<ngraph::opset1::Constant> addConst = nullptr;
if (netPrecision == Precision::FP32) {
addConst = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
} else {
addConst = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(FuncTestUtils::Bf16TestUtils::reducePrecisionBitwiseS(2.0f)) });
}
auto addNode = std::make_shared<opset1::Multiply>(input1, addConst);
addNode->set_friendly_name("Add_1");
// convolution
std::shared_ptr<ngraph::opset1::Constant> weightsNode1 = nullptr, weightsNode2 = nullptr;
ngraph::Shape convFilterShape = { channelsCount, channelsCount, 3, 3 }; // out channel, /input channels, kernel h, kernel w
if (netPrecision == Precision::FP32) {
std::vector<float> weightValuesFP32;
weightValuesFP32.resize(channelsCount * channelsCount * 3 * 3);
FuncTestUtils::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
} else {
std::vector<short> weightValuesBF16;
weightValuesBF16.resize(channelsCount * channelsCount * 3 * 3);
FuncTestUtils::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
}
std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
addNode, weightsNode1,
ngraph::Strides({ 1, 1 }), // strides
ngraph::CoordinateDiff({ 1, 1 }), // pad begin
ngraph::CoordinateDiff({ 1, 1 }), // pad end
ngraph::Strides({ 1, 1 }), // dilation
ngraph::op::PadType::EXPLICIT); // pad type
convNode1->set_friendly_name("Convolution_1");
// interpolation
auto heightSize = static_cast<long>(inputShapes[2]);
auto weigthSize = static_cast<long>(inputShapes[3]);
std::vector<int64_t> outShape = {2 * heightSize, 2 * weigthSize};
auto interpolShape = std::make_shared<ngraph::op::v0::Constant>(ngraph::element::i64, ngraph::Shape{2}, outShape);
ngraph::op::v0::InterpolateAttrs attrs;
attrs.pads_begin.push_back(0);
attrs.pads_end.push_back(0);
attrs.axes = ngraph::AxisSet{2, 3};
attrs.align_corners = false;
attrs.mode = "linear";
attrs.antialias = false;
auto interpolNode = std::make_shared<opset1::Interpolate>(
convNode1,
interpolShape, attrs);
interpolNode->set_friendly_name("Interp");
std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
interpolNode, weightsNode2,
ngraph::Strides({ 1, 1 }), // strides
ngraph::CoordinateDiff({ 1, 1 }), // pad begin
ngraph::CoordinateDiff({ 1, 1 }), // pad end
ngraph::Strides({ 1, 1 }), // dilation
ngraph::op::PadType::EXPLICIT); // pad type
convNode2->set_friendly_name("Convolution_2");
return std::make_shared<ngraph::Function>(convNode2, ngraph::ParameterVector{input1});
}
void SetUp() override {
std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
fnPtr = createGraph(netPrecision);
// STAGE2: set up safe threshold <= 5% from maximum value of output tensor
threshold = 0.02f; // Max in fp32 network by output: 2.531
// STAGE3:
// filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
// performance counters
expectedPrecisions["Convolution_1"] = "BF16";
expectedPrecisions["Interp"] = "FP32";
expectedPrecisions["Convolution_2"] = "BF16";
}
};
TEST_P(Interpolation, CompareWithRefImpl) {
test();
};
INSTANTIATE_TEST_CASE_P(smoke_FP32_bfloat16_NoReshape, Interpolation,
::testing::Combine(
::testing::Values(Precision::FP32),
::testing::Values(Precision::FP32),
::testing::Values(SizeVector({ 1, 1, 2, 2 })),
::testing::Values(SizeVector()),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Interpolation::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_BF16_bfloat16_NoReshape, Interpolation,
::testing::Combine(
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(SizeVector({ 1, 1, 2, 2 })),
::testing::Values(SizeVector()),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Interpolation::getTestCaseName);
} // namespace LayerTestsDefinitions

View File

@ -151,12 +151,9 @@ protected:
// performance counters
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["CONV_2"] = "BF16";
expectedPrecisions["RELU_2"] = "ndef";
expectedPrecisions["DW_CONV"] = "BF16";
expectedPrecisions["RELU_DW"] = "ndef";
expectedPrecisions["NORM_1"] = "FP32";
expectedPrecisions["CONC_1"] = "BF16";
}
};

View File

@ -1,146 +0,0 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "bfloat16_helpers.hpp"
#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <map>
#include <functional>
#include <utility>
#include <ie_core.hpp>
#include <ie_plugin_config.hpp>
#include "common_test_utils/common_utils.hpp"
#include "ngraph/opsets/opset1.hpp"
using namespace std;
using namespace ngraph;
using namespace InferenceEngine;
namespace LayerTestsDefinitions {
class Resample : public BasicBF16Test {
protected:
std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
// Convolution (BF16)
// |
// Interpolation (Resample in the case of mode = "nearest") (FP32)
// |
// Convolution (BF16)
// STAGE1: construction of the GRAPH
ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
// add
auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{inputShapes});
auto channelsCount = inputShapes[1];
input1->set_friendly_name("Input_1");
std::shared_ptr<ngraph::opset1::Constant> addConst = nullptr;
if (netPrecision == Precision::FP32) {
addConst = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
} else {
addConst = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(FuncTestUtils::Bf16TestUtils::reducePrecisionBitwiseS(2.0f)) });
}
auto addNode = std::make_shared<opset1::Multiply>(input1, addConst);
addNode->set_friendly_name("Add_1");
// convolution
std::shared_ptr<ngraph::opset1::Constant> weightsNode1 = nullptr, weightsNode2 = nullptr;
ngraph::Shape convFilterShape = { channelsCount, channelsCount, 3, 3 };  // out channels, input channels, kernel h, kernel w
if (netPrecision == Precision::FP32) {
std::vector<float> weightValuesFP32;
weightValuesFP32.resize(channelsCount * channelsCount * 3 * 3);
FuncTestUtils::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
} else {
std::vector<short> weightValuesBF16;
weightValuesBF16.resize(channelsCount * channelsCount * 3 * 3);
FuncTestUtils::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
weightsNode1 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
}
std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
addNode, weightsNode1,
ngraph::Strides({ 1, 1 }), // strides
ngraph::CoordinateDiff({ 1, 1 }), // pad begin
ngraph::CoordinateDiff({ 1, 1 }), // pad end
ngraph::Strides({ 1, 1 }), // dilation
ngraph::op::PadType::EXPLICIT); // pad type
convNode1->set_friendly_name("Convolution_1");
// interpolation
auto heightSize = static_cast<long>(inputShapes[2]);
        auto widthSize = static_cast<long>(inputShapes[3]);
        std::vector<int64_t> outShape = {2 * heightSize, 2 * widthSize};
auto interpolShape = std::make_shared<ngraph::op::v0::Constant>(ngraph::element::i64, ngraph::Shape{2}, outShape);
ngraph::op::v0::InterpolateAttrs attrs;
attrs.pads_begin.push_back(0);
attrs.pads_end.push_back(0);
attrs.axes = ngraph::AxisSet{2, 3};
attrs.align_corners = false;
attrs.mode = "nearest";
attrs.antialias = false;
auto interpolNode = std::make_shared<opset1::Interpolate>(
convNode1,
interpolShape, attrs);
interpolNode->set_friendly_name("Interp");
std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
interpolNode, weightsNode2,
ngraph::Strides({ 1, 1 }), // strides
ngraph::CoordinateDiff({ 1, 1 }), // pad begin
ngraph::CoordinateDiff({ 1, 1 }), // pad end
ngraph::Strides({ 1, 1 }), // dilation
ngraph::op::PadType::EXPLICIT); // pad type
convNode2->set_friendly_name("Convolution_2");
return std::make_shared<ngraph::Function>(convNode2, ngraph::ParameterVector{input1});
}
void SetUp() override {
std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
fnPtr = createGraph(netPrecision);
        // STAGE2: set up a safe threshold <= 5% of the maximum value in the output tensor
        threshold = 0.02f;  // max in the fp32 network output: 2.35926, so the 5% bound is ~0.118 and 0.02 stays well inside it
        // STAGE3:
        // fill in the expected execution precision of each layer, defined by the precision of the primitive's input tensor and reflected in
        // performance counters
expectedPrecisions["Convolution_1"] = "BF16";
expectedPrecisions["Interp"] = "FP32";
expectedPrecisions["Convolution_2"] = "BF16";
}
};
TEST_P(Resample, CompareWithRefImpl) {
test();
};
INSTANTIATE_TEST_CASE_P(smoke_FP32_bfloat16_NoReshape, Resample,
::testing::Combine(
::testing::Values(Precision::FP32),
::testing::Values(Precision::FP32),
::testing::Values(SizeVector({ 1, 1, 2, 2 })),
::testing::Values(SizeVector()),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Resample::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_BF16_bfloat16_NoReshape, Resample,
::testing::Combine(
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(SizeVector({ 1, 1, 2, 2 })),
::testing::Values(SizeVector()),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
Resample::getTestCaseName);
} // namespace LayerTestsDefinitions
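For context on the graph removed above: with mode = "nearest", the Interp node simply replicates each source pixel into a 2x2 block, and the test pins it to FP32 between the two BF16 convolutions. A minimal single-channel sketch of that resampling (illustrative only, not plugin code) could look like this:

#include <vector>
#include <cstddef>

// 2x nearest-neighbour upsampling of one HxW plane: every source pixel
// is copied into the corresponding 2x2 block of the destination.
static std::vector<float> upsample2xNearest(const std::vector<float> &src, size_t h, size_t w) {
    std::vector<float> dst(4 * h * w);
    for (size_t y = 0; y < 2 * h; ++y)
        for (size_t x = 0; x < 2 * w; ++x)
            dst[y * (2 * w) + x] = src[(y / 2) * w + (x / 2)];
    return dst;
}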

View File

@ -123,7 +123,6 @@ protected:
// performance counters
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["ADD_2"] = "FP32";
expectedPrecisions["ELT_1"] = "ndef";
}
};

View File

@ -93,7 +93,7 @@ protected:
fnPtr = createGraph(netPrecision);
// STAGE1:
threshold = 7e-2;
threshold = 9e-2;
// STAGE2:
// fill in the expected execution precision of each layer, defined by the precision of the primitive's input tensor and reflected in
// performance counters

View File

@ -117,8 +117,6 @@ protected:
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["CONV_2"] = "BF16";
expectedPrecisions["CONC_1"] = "BF16";
expectedPrecisions["RELU_1"] = "FP32";
}
};

View File

@ -131,8 +131,7 @@ protected:
expectedPrecisions["Add_1"] = "FP32";
expectedPrecisions["Add_2"] = "FP32";
expectedPrecisions["Convolution_1"] = "BF16";
expectedPrecisions["Convolution_2"] = "BF16";
expectedPrecisions["ELT_1"] = "FP32";
expectedPrecisions["ELT_1"] = "ndef";
}
};

View File

@ -152,7 +152,6 @@ protected:
expectedPrecisions["Add_2"] = "FP32";
expectedPrecisions["ELT_1"] = "ndef";
expectedPrecisions["RELU_1"] = "ndef";
expectedPrecisions["Add_3"] = "FP32";
}
};

View File

@ -114,7 +114,6 @@ protected:
// performance counters
expectedPrecisions["Add_4"] = "FP32";
expectedPrecisions["Convolution_6"] = "BF16";
expectedPrecisions["AvgPool_8"] = "FP32";
}
};

View File

@ -61,7 +61,7 @@ std::vector<std::string> disabledTestPatterns() {
R"(.*decomposition1_batch=5_hidden_size=10_input_size=30_.*tanh.relu.*_clip=0_linear_before_reset=1.*_targetDevice=CPU_.*)",
};
if (!InferenceEngine::with_cpu_x86_bfloat16()) {
if (!InferenceEngine::with_cpu_x86_avx512_core()) {
// on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
// tests are useless on such platforms
retVector.emplace_back(R"(.*BF16.*)");
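The filter above now keys off general AVX512 support rather than native BF16 instructions, matching the BF16 simulation path added in this commit. Purely as an illustration (this is not the InferenceEngine implementation, and treating "core" as F + BW + DQ + VL is an assumption), such a capability check could be built on compiler builtins:

// Hypothetical stand-in for an "AVX512 core" check: F + BW + DQ + VL is enough
// for the simulated bf16 path even on CPUs without AVX512_BF16.
static bool cpuHasAvx512Core() {
    return __builtin_cpu_supports("avx512f")
        && __builtin_cpu_supports("avx512bw")
        && __builtin_cpu_supports("avx512dq")
        && __builtin_cpu_supports("avx512vl");
}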

View File

@ -89,8 +89,6 @@ const std::vector<ngraph::op::EpsMode> epsMode = {
ngraph::op::EpsMode::MAX,
};
std::vector<Precision> inpOutPrc = {Precision::BF16};
std::vector<CPUSpecificParams> cpuParams_4D = {
CPUSpecificParams({nChw16c}, {nChw16c}, {}, {}),
CPUSpecificParams({nhwc}, {nhwc}, {}, {}),