[CPU] Introduced shape agnostic eltwise (#15976)
This commit is contained in:
@@ -354,31 +354,55 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
|
|||||||
const int offset_count = jep.input_size - 1;
|
const int offset_count = jep.input_size - 1;
|
||||||
|
|
||||||
// ptrs initializing
|
// ptrs initializing
|
||||||
auto init_ptrs_with_offsets = [this, offset_count](Reg64 pointer, const std::vector<size_t>& offsets) {
|
if (jep.use_runtime_ptrs) {
|
||||||
for (int j = 0; j < offset_count; j++) {
|
for (int i = 0; i < jep.inputs_number; i++) {
|
||||||
if (jep_.dims[j] != 1 && offsets[j] != 0) {
|
mov(start_to_offsets, ptr[reg_const_params + GET_OFF(src_offsets) + i * sizeof(size_t)]);
|
||||||
mov(reg_tmp_64, offsets[j]);
|
mov(get_src_reg(i), ptr[reg_const_params + GET_OFF(src_ptr[0]) + i * sizeof(size_t)]);
|
||||||
|
for (int j = 0; j < offset_count; j++) {
|
||||||
|
mov(reg_tmp_64, ptr[start_to_offsets + j * sizeof(size_t)]);
|
||||||
imul(reg_tmp_64, ptr[reg_indexes + j * sizeof(size_t)]);
|
imul(reg_tmp_64, ptr[reg_indexes + j * sizeof(size_t)]);
|
||||||
add(pointer, reg_tmp_64);
|
add(get_src_reg(i), reg_tmp_64);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
|
||||||
|
|
||||||
for (int i = 0; i < jep.inputs_number; i++) {
|
mov(start_to_offsets, ptr[reg_const_params + GET_OFF(dst_offsets)]);
|
||||||
mov(get_src_reg(i), ptr[reg_const_params + GET_OFF(src_ptr[0]) + i * sizeof(size_t)]);
|
mov(reg_dst, ptr[reg_const_params + GET_OFF(dst_ptr)]);
|
||||||
init_ptrs_with_offsets(get_src_reg(i), jep.src_offsets[i]);
|
for (int j = 0; j < offset_count; j++) {
|
||||||
|
mov(reg_tmp_64, ptr[start_to_offsets + j * sizeof(size_t)]);
|
||||||
|
imul(reg_tmp_64, ptr[reg_indexes + j * sizeof(size_t)]);
|
||||||
|
add(reg_dst, reg_tmp_64);
|
||||||
|
}
|
||||||
|
|
||||||
|
xor_(reg_oc_off, reg_oc_off);
|
||||||
|
|
||||||
|
mov(reg_work_amount, ptr[reg_const_params + GET_OFF(work_amount)]);
|
||||||
|
} else {
|
||||||
|
auto init_ptrs_with_offsets = [this, offset_count](Reg64 pointer, const std::vector<size_t>& offsets) {
|
||||||
|
for (int j = 0; j < offset_count; j++) {
|
||||||
|
if (jep_.dims[j] != 1 && offsets[j] != 0) {
|
||||||
|
mov(reg_tmp_64, offsets[j]);
|
||||||
|
imul(reg_tmp_64, ptr[reg_indexes + j * sizeof(size_t)]);
|
||||||
|
add(pointer, reg_tmp_64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int i = 0; i < jep.inputs_number; i++) {
|
||||||
|
mov(get_src_reg(i), ptr[reg_const_params + GET_OFF(src_ptr[0]) + i * sizeof(size_t)]);
|
||||||
|
init_ptrs_with_offsets(get_src_reg(i), jep.src_offsets[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
mov(reg_dst, ptr[reg_const_params + GET_OFF(dst_ptr)]);
|
||||||
|
init_ptrs_with_offsets(reg_dst, jep.dst_offsets);
|
||||||
|
|
||||||
|
xor_(reg_oc_off, reg_oc_off);
|
||||||
|
init_ptrs_with_offsets(reg_oc_off, jep.oc_offsets);
|
||||||
|
|
||||||
|
mov(reg_work_amount, jep.work_amount);
|
||||||
}
|
}
|
||||||
|
|
||||||
mov(reg_dst, ptr[reg_const_params + GET_OFF(dst_ptr)]);
|
|
||||||
init_ptrs_with_offsets(reg_dst, jep.dst_offsets);
|
|
||||||
|
|
||||||
mov(reg_post_op_ptrs, ptr[reg_const_params + GET_OFF(post_op_data)]);
|
mov(reg_post_op_ptrs, ptr[reg_const_params + GET_OFF(post_op_data)]);
|
||||||
|
|
||||||
xor_(reg_oc_off, reg_oc_off);
|
|
||||||
init_ptrs_with_offsets(reg_oc_off, jep.oc_offsets);
|
|
||||||
|
|
||||||
mov(reg_work_amount, jep.work_amount);
|
|
||||||
|
|
||||||
Xbyak::Label unroll_loop_label;
|
Xbyak::Label unroll_loop_label;
|
||||||
Xbyak::Label unroll_loop_end_label;
|
Xbyak::Label unroll_loop_end_label;
|
||||||
Xbyak::Label main_loop_label;
|
Xbyak::Label main_loop_label;
|
||||||
@@ -565,6 +589,7 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
Reg64 reg_post_op_ptrs = rax;
|
Reg64 reg_post_op_ptrs = rax;
|
||||||
|
Reg64 start_to_offsets = reg_post_op_ptrs; // rax
|
||||||
Reg64 reg_dst = rbx;
|
Reg64 reg_dst = rbx;
|
||||||
Reg64 reg_work_amount = rdx;
|
Reg64 reg_work_amount = rdx;
|
||||||
|
|
||||||
@@ -1186,7 +1211,7 @@ struct EltwiseKey {
|
|||||||
InferenceEngine::Precision outPrc;
|
InferenceEngine::Precision outPrc;
|
||||||
dnnl::post_ops postOps;
|
dnnl::post_ops postOps;
|
||||||
bool useDynBatch;
|
bool useDynBatch;
|
||||||
bool useJit;
|
EltwiseImplType implType;
|
||||||
|
|
||||||
size_t hash() const {
|
size_t hash() const {
|
||||||
using namespace dnnl::impl;
|
using namespace dnnl::impl;
|
||||||
@@ -1204,10 +1229,17 @@ struct EltwiseKey {
|
|||||||
seed = hash_combine_eltwiseData(seed, item);
|
seed = hash_combine_eltwiseData(seed, item);
|
||||||
});
|
});
|
||||||
seed = get_vector_hash(seed, ops_list);
|
seed = get_vector_hash(seed, ops_list);
|
||||||
seed = get_vector_hash(seed, outBlkDims);
|
if (implType == EltwiseImplType::optimizedShapeAgnostic) {
|
||||||
seed = get_vector_hash(seed, outOrder);
|
seed = hash_combine(seed, outBlkDims.back() == 1);
|
||||||
for (auto&& item : inpDims) {
|
for (auto&& item : inpDims) {
|
||||||
seed = get_vector_hash(seed, item);
|
seed = hash_combine(seed, item.back() == 1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
seed = get_vector_hash(seed, outOrder);
|
||||||
|
seed = get_vector_hash(seed, outBlkDims);
|
||||||
|
for (auto&& item : inpDims) {
|
||||||
|
seed = get_vector_hash(seed, item);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
std::for_each(inpPrc.begin(), inpPrc.end(), [&](const Precision& item) {
|
std::for_each(inpPrc.begin(), inpPrc.end(), [&](const Precision& item) {
|
||||||
seed = hash_combine(seed, item.getPrecVal());
|
seed = hash_combine(seed, item.getPrecVal());
|
||||||
@@ -1215,7 +1247,7 @@ struct EltwiseKey {
|
|||||||
seed = hash_combine(seed, outPrc.getPrecVal());
|
seed = hash_combine(seed, outPrc.getPrecVal());
|
||||||
seed = get_post_op_hash(seed, *postOps.get());
|
seed = get_post_op_hash(seed, *postOps.get());
|
||||||
seed = hash_combine(seed, useDynBatch);
|
seed = hash_combine(seed, useDynBatch);
|
||||||
seed = hash_combine(seed, useJit);
|
seed = hash_combine(seed, implType);
|
||||||
return seed;
|
return seed;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1226,17 +1258,30 @@ struct EltwiseKey {
|
|||||||
|
|
||||||
bool result = eltwise_data == rhs.eltwise_data &&
|
bool result = eltwise_data == rhs.eltwise_data &&
|
||||||
ops_list == rhs.ops_list &&
|
ops_list == rhs.ops_list &&
|
||||||
outBlkDims == rhs.outBlkDims &&
|
|
||||||
outOrder == rhs.outOrder &&
|
|
||||||
inpPrc == rhs.inpPrc &&
|
inpPrc == rhs.inpPrc &&
|
||||||
outPrc == rhs.outPrc &&
|
outPrc == rhs.outPrc &&
|
||||||
*postOps.get() == *rhs.postOps.get() &&
|
*postOps.get() == *rhs.postOps.get() &&
|
||||||
useDynBatch == rhs.useDynBatch &&
|
useDynBatch == rhs.useDynBatch &&
|
||||||
useJit == rhs.useJit;
|
implType == rhs.implType;
|
||||||
|
|
||||||
for (size_t i = 0; i < inpDims.size() && result; ++i) {
|
if (result) {
|
||||||
result = result && (inpDims[i] == rhs.inpDims[i]);
|
if (implType == EltwiseImplType::optimizedShapeAgnostic) {
|
||||||
|
bool broadcast, rhsBroadcast;
|
||||||
|
for (size_t i = 0; i < inpDims.size(); ++i) {
|
||||||
|
broadcast = (inpDims[i].back() == 1);
|
||||||
|
rhsBroadcast = (rhs.inpDims[i].back() == 1);
|
||||||
|
if (broadcast != rhsBroadcast)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
result = result && outOrder == rhs.outOrder &&
|
||||||
|
outBlkDims == rhs.outBlkDims;
|
||||||
|
for (size_t i = 0; i < inpDims.size() && result; ++i) {
|
||||||
|
result = result && (inpDims[i] == rhs.inpDims[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -1267,7 +1312,8 @@ public:
|
|||||||
const std::vector<InferenceEngine::Precision>& inpPrc,
|
const std::vector<InferenceEngine::Precision>& inpPrc,
|
||||||
const InferenceEngine::Precision& outPrc,
|
const InferenceEngine::Precision& outPrc,
|
||||||
const dnnl::post_ops& post_ops,
|
const dnnl::post_ops& post_ops,
|
||||||
bool useDynBatch) {
|
bool useDynBatch,
|
||||||
|
bool useRuntimePtrs) {
|
||||||
auto collapseLastDims = [](std::vector<size_t>& dims, int dimsToCollapse) {
|
auto collapseLastDims = [](std::vector<size_t>& dims, int dimsToCollapse) {
|
||||||
for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
|
for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
|
||||||
dims[dims.size() - 1] *= dims[i];
|
dims[dims.size() - 1] *= dims[i];
|
||||||
@@ -1314,6 +1360,8 @@ public:
|
|||||||
jit_eltwise_params jep = {};
|
jit_eltwise_params jep = {};
|
||||||
size_t inputsNumber = inpDims.size();
|
size_t inputsNumber = inpDims.size();
|
||||||
|
|
||||||
|
jep.use_runtime_ptrs = useRuntimePtrs;
|
||||||
|
|
||||||
jep.input_size = inpDims.front().size();
|
jep.input_size = inpDims.front().size();
|
||||||
|
|
||||||
jep.dims.resize(jep.input_size, 1);
|
jep.dims.resize(jep.input_size, 1);
|
||||||
@@ -1335,7 +1383,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (outBlkDims.size() != outOrder.size()) {
|
if (outBlkDims.size() != outOrder.size()) {
|
||||||
IE_THROW() << "Can not make Elwtise executor due to out blocked dims and out order vectors size mismatch.";
|
IE_THROW() << "Can not make Eltwise executor due to out blocked dims and out order vectors size mismatch.";
|
||||||
}
|
}
|
||||||
|
|
||||||
int lastUnchangedAxis = 0;
|
int lastUnchangedAxis = 0;
|
||||||
@@ -1370,7 +1418,7 @@ public:
|
|||||||
int collapsedDims = 0;
|
int collapsedDims = 0;
|
||||||
|
|
||||||
bool hasDifferentDims = false;
|
bool hasDifferentDims = false;
|
||||||
while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount &&
|
while (!useRuntimePtrs && currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount &&
|
||||||
// we shouldn't collapse batch dimension in case dynamic batch is enabled
|
// we shouldn't collapse batch dimension in case dynamic batch is enabled
|
||||||
(!useDynBatch || (outBlkDims.size() - collapsedDims > 2))) {
|
(!useDynBatch || (outBlkDims.size() - collapsedDims > 2))) {
|
||||||
if (collapsedDims >= maxCollapsedDims)
|
if (collapsedDims >= maxCollapsedDims)
|
||||||
@@ -1418,25 +1466,27 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_batchDimIdx = jep.input_size - outBlkDims.size() + collapsedDims;
|
|
||||||
_schedulerWorkAmount = fullWorkAmount / jep.dims[jep.dims.size() - 1];
|
|
||||||
|
|
||||||
if (inpPrc.size() != inputsNumber) {
|
if (inpPrc.size() != inputsNumber) {
|
||||||
IE_THROW() << "Can not make Elwtise executor. Wrong input precisions vector size.";
|
IE_THROW() << "Can not make Eltwise executor. Wrong input precisions vector size.";
|
||||||
}
|
}
|
||||||
|
|
||||||
// init offset
|
if (!useRuntimePtrs) {
|
||||||
jep.dst_offsets.resize(jep.input_size, 1);
|
_batchDimIdx = jep.input_size - outBlkDims.size() + collapsedDims;
|
||||||
offset_out_calc(jep.dst_offsets, jep.dims);
|
_schedulerWorkAmount = fullWorkAmount / jep.dims[jep.dims.size() - 1];
|
||||||
for (int j = 0; j < jep.input_size; j++) {
|
|
||||||
jep.dst_offsets[j] *= outPrc.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < inputsNumber; i++) {
|
// init offset
|
||||||
jep.src_offsets[i].resize(jep.input_size, 1);
|
jep.dst_offsets.resize(jep.input_size, 1);
|
||||||
offset_in_calc(jep.src_offsets[i], inpDims[i], jep.dims);
|
offset_out_calc(jep.dst_offsets, jep.dims);
|
||||||
for (int j = 0; j < jep.input_size; j++) {
|
for (int j = 0; j < jep.input_size; j++) {
|
||||||
jep.src_offsets[i][j] *= inpPrc[i].size();
|
jep.dst_offsets[j] *= outPrc.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < inputsNumber; i++) {
|
||||||
|
jep.src_offsets[i].resize(jep.input_size, 1);
|
||||||
|
offset_in_calc(jep.src_offsets[i], inpDims[i], jep.dims);
|
||||||
|
for (int j = 0; j < jep.input_size; j++) {
|
||||||
|
jep.src_offsets[i][j] *= inpPrc[i].size();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1486,6 +1536,13 @@ public:
|
|||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
// execute Optimized Generic
|
// execute Optimized Generic
|
||||||
|
if (_pKernel->jep_.use_runtime_ptrs) {
|
||||||
|
// recalculate _schedulerWorkAmount
|
||||||
|
_schedulerWorkAmount = 1;
|
||||||
|
for (size_t i = 0; i < dims_out.size() - 1; i++) {
|
||||||
|
_schedulerWorkAmount *= dims_out[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
parallel_nt(0, [&](const int ithr, const int nthr) {
|
parallel_nt(0, [&](const int ithr, const int nthr) {
|
||||||
size_t start = 0, end = 0;
|
size_t start = 0, end = 0;
|
||||||
splitter(_schedulerWorkAmount, nthr, ithr, start, end);
|
splitter(_schedulerWorkAmount, nthr, ithr, start, end);
|
||||||
@@ -1538,7 +1595,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (outBlkDims.empty()) {
|
if (outBlkDims.empty()) {
|
||||||
IE_THROW() << "Can not make Elwtise executor from empty output blocked dims vector";
|
IE_THROW() << "Can not make Eltwise executor from empty output blocked dims vector";
|
||||||
}
|
}
|
||||||
|
|
||||||
_inputNum = inpDims.size();
|
_inputNum = inpDims.size();
|
||||||
@@ -1699,7 +1756,7 @@ bool Eltwise::EltwiseData::operator==(const EltwiseData &rhs) const noexcept {
|
|||||||
|
|
||||||
static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) {
|
static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) {
|
||||||
Eltwise::executorPtr execPtr;
|
Eltwise::executorPtr execPtr;
|
||||||
if (key.useJit) {
|
if (key.implType != EltwiseImplType::reference) {
|
||||||
execPtr = std::make_shared<EltwiseJitExecutor>(key.eltwise_data,
|
execPtr = std::make_shared<EltwiseJitExecutor>(key.eltwise_data,
|
||||||
key.ops_list,
|
key.ops_list,
|
||||||
key.outBlkDims,
|
key.outBlkDims,
|
||||||
@@ -1708,7 +1765,8 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) {
|
|||||||
key.inpPrc,
|
key.inpPrc,
|
||||||
key.outPrc,
|
key.outPrc,
|
||||||
key.postOps,
|
key.postOps,
|
||||||
key.useDynBatch);
|
key.useDynBatch,
|
||||||
|
key.implType == EltwiseImplType::optimizedShapeAgnostic);
|
||||||
} else {
|
} else {
|
||||||
execPtr = std::make_shared<EltwiseRefExecutor>(key.eltwise_data.front(),
|
execPtr = std::make_shared<EltwiseRefExecutor>(key.eltwise_data.front(),
|
||||||
key.outBlkDims,
|
key.outBlkDims,
|
||||||
@@ -1840,7 +1898,8 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
// if dim rank is greater than the maximum possible, we should use the reference execution
|
// if dim rank is greater than the maximum possible, we should use the reference execution
|
||||||
canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK;
|
bool canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK;
|
||||||
|
bool canUseOptimizedShapeAgnosticImpl = isDynamicNode() && canUseOptimizedImpl;
|
||||||
|
|
||||||
if (!canUseOptimizedImpl && !fusedWith.empty()) {
|
if (!canUseOptimizedImpl && !fusedWith.empty()) {
|
||||||
IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' uses reference impl, but unexpectedly fused with other ops";
|
IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' uses reference impl, but unexpectedly fused with other ops";
|
||||||
@@ -1873,7 +1932,12 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
|
|||||||
inputPrecisions.push_back(fusedNode->getOriginalInputPrecisionAtPort(i));
|
inputPrecisions.push_back(fusedNode->getOriginalInputPrecisionAtPort(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (fusedNode->getType() == Type::FakeQuantize) {
|
||||||
|
canUseOptimizedShapeAgnosticImpl = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
implType = canUseOptimizedShapeAgnosticImpl ? EltwiseImplType::optimizedShapeAgnostic :
|
||||||
|
canUseOptimizedImpl ? EltwiseImplType::optimized : EltwiseImplType::reference;
|
||||||
|
|
||||||
if (inputPrecisions.size() != getParentEdges().size())
|
if (inputPrecisions.size() != getParentEdges().size())
|
||||||
IE_THROW() << "Eltwise node with name `" << getName() << "` has invalid input precisions configuration.";
|
IE_THROW() << "Eltwise node with name `" << getName() << "` has invalid input precisions configuration.";
|
||||||
@@ -1894,7 +1958,7 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto filterPrecision = [&](Precision& prc) {
|
auto filterPrecision = [&](Precision& prc) {
|
||||||
if (!canUseOptimizedImpl) {
|
if (implType == EltwiseImplType::reference) {
|
||||||
return Precision(Precision::FP32);
|
return Precision(Precision::FP32);
|
||||||
} else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) {
|
} else if (std::find(supportedPrecisions.begin(), supportedPrecisions.end(), prc) == supportedPrecisions.end()) {
|
||||||
if (prc == Precision::U32 || prc == Precision::I64 || prc == Precision::U64) {
|
if (prc == Precision::U32 || prc == Precision::I64 || prc == Precision::U64) {
|
||||||
@@ -2051,17 +2115,35 @@ void Eltwise::initSupportedPrimitiveDescriptors() {
|
|||||||
currentInBlkDims.resize(inputNum);
|
currentInBlkDims.resize(inputNum);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Eltwise::prepareParams() {
|
void Eltwise::createPrimitive() {
|
||||||
if (memPtrs.empty()) {
|
if (memPtrs.empty()) {
|
||||||
for (auto i = 0; i < inputNum; i++)
|
for (auto i = 0; i < inputNum; i++)
|
||||||
memPtrs.push_back(getParentEdgeAt(i)->getMemoryPtr());
|
memPtrs.push_back(getParentEdgeAt(i)->getMemoryPtr());
|
||||||
memPtrs.push_back(getChildEdgeAt(0)->getMemoryPtr());
|
memPtrs.push_back(getChildEdgeAt(0)->getMemoryPtr());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
isDynBatchEnabled = getSelectedPrimitiveDescriptor()->getConfig().dynBatchSupport;
|
||||||
|
|
||||||
|
start_offset_in.resize(inputNum);
|
||||||
|
for (size_t i = 0; i < inputNum; i++) {
|
||||||
|
const auto desc = getParentEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||||
|
start_offset_in[i] = desc->getOffsetPadding() * desc->getPrecision().size();
|
||||||
|
}
|
||||||
|
const auto desc = getChildEdgeAt(0)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||||
|
start_offset_out = desc->getOffsetPadding() * desc->getPrecision().size();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < inputNum; ++i) {
|
||||||
|
inpPrc.push_back(getParentEdgeAt(i)->getMemory().getDesc().getPrecision());
|
||||||
|
}
|
||||||
|
|
||||||
|
outPrc = getChildEdgeAt(0)->getMemory().getDesc().getPrecision();
|
||||||
|
Node::createPrimitive();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Eltwise::prepareParams() {
|
||||||
auto outBlockingDesc = getChildEdgeAt(0)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
auto outBlockingDesc = getChildEdgeAt(0)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
||||||
const auto &outOrder = outBlockingDesc->getOrder();
|
const auto &outOrder = outBlockingDesc->getOrder();
|
||||||
const auto &currentOutBlkDims = outBlockingDesc->getBlockDims();
|
const auto &currentOutBlkDims = outBlockingDesc->getBlockDims();
|
||||||
isDynBatchEnabled = getSelectedPrimitiveDescriptor()->getConfig().dynBatchSupport;
|
|
||||||
|
|
||||||
size_t input_size = std::max(static_cast<size_t>(EltwiseJitExecutor::optimalTensorRank), currentOutBlkDims.size());
|
size_t input_size = std::max(static_cast<size_t>(EltwiseJitExecutor::optimalTensorRank), currentOutBlkDims.size());
|
||||||
|
|
||||||
@@ -2094,43 +2176,97 @@ void Eltwise::prepareParams() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
start_offset_in.resize(inputNum);
|
// we can skip searching in the cache if broadcast policy for last input dims is not changed
|
||||||
for (size_t i = 0; i < inputNum; i++) {
|
// last input dim == 1 means broadcasted (also if output dim == 1)
|
||||||
const auto desc = getParentEdgeAt(i)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
// last input dim != 1 means not broadcasted
|
||||||
start_offset_in[i] = desc->getOffsetPadding() * desc->getPrecision().size();
|
bool canSkipSearchInCache = false;
|
||||||
}
|
if (implType == EltwiseImplType::optimizedShapeAgnostic) {
|
||||||
const auto desc = getChildEdgeAt(0)->getMemory().GetDescWithType<BlockedMemoryDesc>();
|
if (execPtr) {
|
||||||
start_offset_out = desc->getOffsetPadding() * desc->getPrecision().size();
|
canSkipSearchInCache = true;
|
||||||
|
// check broadcast policy
|
||||||
std::vector<InferenceEngine::Precision> inpPrc;
|
for (int i = 0; i < inputNum; i++) {
|
||||||
for (size_t i = 0; i < inputNum; ++i) {
|
if (broadcastPolicy[i] != (dims_in[i].back() == 1)) {
|
||||||
inpPrc.push_back(getParentEdgeAt(i)->getMemory().getDesc().getPrecision());
|
broadcastPolicy[i] = (dims_in[i].back() == 1);
|
||||||
}
|
canSkipSearchInCache = false;
|
||||||
|
}
|
||||||
auto outPrc = getChildEdgeAt(0)->getMemory().getDesc().getPrecision();
|
|
||||||
|
|
||||||
EltwiseData thisOp{getAlgorithm(), getOneDnnAlgorithm(), getAlpha(), getBeta(), getGamma()};
|
|
||||||
|
|
||||||
EltwiseKey key = {{thisOp}, {getType()}, currentOutBlkDims, outOrder, dims_in, inpPrc, outPrc, dnnl::post_ops(), isDynBatchEnabled, canUseOptimizedImpl};
|
|
||||||
|
|
||||||
fqDataPtrs.clear();
|
|
||||||
for (const auto &node : fusedWith) {
|
|
||||||
key.ops_list.push_back(node->getType());
|
|
||||||
if (node->getType() == Type::Eltwise) {
|
|
||||||
if (auto eltwise = std::dynamic_pointer_cast<Eltwise>(node)) {
|
|
||||||
key.eltwise_data.push_back({eltwise->getAlgorithm(), eltwise->getOneDnnAlgorithm(), eltwise->getAlpha(),
|
|
||||||
eltwise->getBeta(), eltwise->getGamma()});
|
|
||||||
}
|
}
|
||||||
} else if (node->getType() == Type::FakeQuantize) {
|
|
||||||
node->appendPostOps(key.postOps, {}, fqDataPtrs);
|
|
||||||
} else {
|
} else {
|
||||||
IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' has unexpected fused op of type '" << node->getTypeStr() << "'";
|
// fill broadcast policy
|
||||||
|
broadcastPolicy.resize(inputNum);
|
||||||
|
for (int i = 0; i < inputNum; i++) {
|
||||||
|
broadcastPolicy[i] = (dims_in[i].back() == 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto cache = context->getParamsCache();
|
if (!canSkipSearchInCache) {
|
||||||
auto result = cache->getOrCreate(key, buildExecutor);
|
EltwiseData thisOp{getAlgorithm(), getOneDnnAlgorithm(), getAlpha(), getBeta(), getGamma()};
|
||||||
execPtr = result.first;
|
EltwiseKey key = {{thisOp}, {getType()}, currentOutBlkDims, outOrder, dims_in, inpPrc, outPrc, dnnl::post_ops(), isDynBatchEnabled, implType};
|
||||||
|
fqDataPtrs.clear();
|
||||||
|
for (const auto &node : fusedWith) {
|
||||||
|
key.ops_list.push_back(node->getType());
|
||||||
|
if (node->getType() == Type::Eltwise) {
|
||||||
|
if (auto eltwise = std::dynamic_pointer_cast<Eltwise>(node)) {
|
||||||
|
key.eltwise_data.push_back({eltwise->getAlgorithm(), eltwise->getOneDnnAlgorithm(), eltwise->getAlpha(),
|
||||||
|
eltwise->getBeta(), eltwise->getGamma()});
|
||||||
|
}
|
||||||
|
} else if (node->getType() == Type::FakeQuantize) {
|
||||||
|
node->appendPostOps(key.postOps, {}, fqDataPtrs);
|
||||||
|
} else {
|
||||||
|
IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' has unexpected fused op of type '" << node->getTypeStr() << "'";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto cache = context->getParamsCache();
|
||||||
|
auto result = cache->getOrCreate(key, buildExecutor);
|
||||||
|
execPtr = result.first;
|
||||||
|
}
|
||||||
|
|
||||||
|
// update execParams for shape agnostic kernel
|
||||||
|
if (implType == EltwiseImplType::optimizedShapeAgnostic) {
|
||||||
|
auto &outDims = execParams.outDims;
|
||||||
|
auto &inOffsets = execParams.inOffsets;
|
||||||
|
auto &outOffsets = execParams.outOffsets;
|
||||||
|
|
||||||
|
// outDims recalculation
|
||||||
|
outDims.resize(dims_in[0].size(), 1);
|
||||||
|
for (int i = 0; i < outRank; i++) {
|
||||||
|
outDims[outDims.size() - 1 - i] = currentOutBlkDims[outRank - 1 - i];
|
||||||
|
}
|
||||||
|
// offsets recalculation
|
||||||
|
auto offset_out_calc = [](VectorDims& offset, const VectorDims& dims) {
|
||||||
|
int k = 1;
|
||||||
|
for (int i = offset.size() - 1; i >= 0; i--) {
|
||||||
|
offset[i] = k;
|
||||||
|
k *= dims[i];
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto offset_in_calc = [](VectorDims& offset, const VectorDims& dims_in, const VectorDims& dims_out) {
|
||||||
|
int k = 1;
|
||||||
|
for (int i = offset.size() - 1; i >= 0; i--) {
|
||||||
|
offset[i] = (dims_in[i] == dims_out[i]) ? k : 0;
|
||||||
|
k *= dims_in[i];
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto inputSize = dims_in.front().size();
|
||||||
|
outOffsets.resize(inputSize, 1);
|
||||||
|
offset_out_calc(outOffsets, outDims);
|
||||||
|
for (int j = 0; j < inputSize; j++) {
|
||||||
|
outOffsets[j] *= outPrc.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
auto inputsNumber = dims_in.size();
|
||||||
|
inOffsets.resize(inputsNumber);
|
||||||
|
for (int i = 0; i < inputsNumber; i++) {
|
||||||
|
inOffsets[i].resize(inputSize, 1);
|
||||||
|
offset_in_calc(inOffsets[i], dims_in[i], outDims);
|
||||||
|
for (int j = 0; j < inputSize; j++) {
|
||||||
|
inOffsets[i][j] *= inpPrc[i].size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Eltwise::needPrepareParams() const {
|
bool Eltwise::needPrepareParams() const {
|
||||||
@@ -2148,14 +2284,14 @@ void Eltwise::selectOptimalPrimitiveDescriptor() {
|
|||||||
void Eltwise::execute(dnnl::stream strm) {
|
void Eltwise::execute(dnnl::stream strm) {
|
||||||
if (execPtr) {
|
if (execPtr) {
|
||||||
jit_eltwise_call_args_ptrs args_ptrs = {};
|
jit_eltwise_call_args_ptrs args_ptrs = {};
|
||||||
auto batchDimIdx = execPtr->getBatchDimIdx();
|
VectorDims dims_out = implType == EltwiseImplType::optimizedShapeAgnostic ? execParams.outDims : execPtr->getOutDims();
|
||||||
VectorDims dims_out = execPtr->getOutDims();
|
|
||||||
for (int i = 0; i < memPtrs.size() - 1; i++)
|
for (int i = 0; i < memPtrs.size() - 1; i++)
|
||||||
args_ptrs.src_ptr[i] = reinterpret_cast<const uint8_t*>(memPtrs[i]->GetData()) + start_offset_in[i];
|
args_ptrs.src_ptr[i] = reinterpret_cast<const uint8_t*>(memPtrs[i]->GetData()) + start_offset_in[i];
|
||||||
args_ptrs.dst_ptr = reinterpret_cast<uint8_t*>(memPtrs.back()->GetData()) + start_offset_out;
|
args_ptrs.dst_ptr = reinterpret_cast<uint8_t*>(memPtrs.back()->GetData()) + start_offset_out;
|
||||||
|
|
||||||
// In general case we need to recompute offsets as well but currently all supported layout assumes batch to be outermost dimension
|
// In general case we need to recompute offsets as well but currently all supported layout assumes batch to be outermost dimension
|
||||||
if (isDynBatchEnabled) {
|
if (isDynBatchEnabled) {
|
||||||
|
auto batchDimIdx = execPtr->getBatchDimIdx();
|
||||||
if (dims_out.size() <= batchDimIdx)
|
if (dims_out.size() <= batchDimIdx)
|
||||||
IE_THROW() << "Can't set batch dims for eltwise node with rank: " << dims_out.size() << " and batch idx: " << batchDimIdx;
|
IE_THROW() << "Can't set batch dims for eltwise node with rank: " << dims_out.size() << " and batch idx: " << batchDimIdx;
|
||||||
dims_out[batchDimIdx] = static_cast<size_t>(batchToProcess());
|
dims_out[batchDimIdx] = static_cast<size_t>(batchToProcess());
|
||||||
@@ -2163,6 +2299,15 @@ void Eltwise::execute(dnnl::stream strm) {
|
|||||||
|
|
||||||
args_ptrs.post_op_data = fqDataPtrs.data();
|
args_ptrs.post_op_data = fqDataPtrs.data();
|
||||||
|
|
||||||
|
// shape agnostic kernel: offsets and work amount initialization
|
||||||
|
if (implType == EltwiseImplType::optimizedShapeAgnostic) {
|
||||||
|
args_ptrs.work_amount = dims_out.back();
|
||||||
|
for (int i = 0; i < execParams.inOffsets.size(); i++) {
|
||||||
|
args_ptrs.src_offsets[i] = execParams.inOffsets[i].data();
|
||||||
|
}
|
||||||
|
args_ptrs.dst_offsets = execParams.outOffsets.data();
|
||||||
|
}
|
||||||
|
|
||||||
execPtr->exec(args_ptrs, dims_out);
|
execPtr->exec(args_ptrs, dims_out);
|
||||||
} else {
|
} else {
|
||||||
IE_THROW() << "Can't execute eltwise node with name: " << getName() << ". Primitive isn't created";
|
IE_THROW() << "Can't execute eltwise node with name: " << getName() << ". Primitive isn't created";
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ struct jit_eltwise_params {
|
|||||||
size_t oc_size;
|
size_t oc_size;
|
||||||
|
|
||||||
size_t work_amount;
|
size_t work_amount;
|
||||||
|
bool use_runtime_ptrs;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct jit_eltwise_call_args_ptrs {
|
struct jit_eltwise_call_args_ptrs {
|
||||||
@@ -42,6 +43,11 @@ struct jit_eltwise_call_args_ptrs {
|
|||||||
void *dst_ptr;
|
void *dst_ptr;
|
||||||
//ptr to array of post op inputs pointers (flat list)
|
//ptr to array of post op inputs pointers (flat list)
|
||||||
const void** post_op_data;
|
const void** post_op_data;
|
||||||
|
|
||||||
|
// shape agnostic kernel
|
||||||
|
size_t work_amount;
|
||||||
|
const void *src_offsets[MAX_ELTWISE_INPUTS];
|
||||||
|
const void *dst_offsets;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct jit_eltwise_call_args_indexes {
|
struct jit_eltwise_call_args_indexes {
|
||||||
@@ -66,6 +72,12 @@ struct jit_uni_eltwise_kernel {
|
|||||||
jit_eltwise_params jep_;
|
jit_eltwise_params jep_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum class EltwiseImplType { // selects which executor flavour buildExecutor() creates for an Eltwise node
|
||||||
|
reference = 0, // EltwiseRefExecutor is built; filterPrecision() forces FP32 for this path
|
||||||
|
optimized = 1, // EltwiseJitExecutor with use_runtime_ptrs == false: offsets and work amount are emitted as constants from jep
|
||||||
|
optimizedShapeAgnostic = 2 // EltwiseJitExecutor with use_runtime_ptrs == true: offsets and work amount are read from the call args on every run
|
||||||
|
};
|
||||||
|
|
||||||
class Eltwise : public Node {
|
class Eltwise : public Node {
|
||||||
public:
|
public:
|
||||||
struct EltwiseData {
|
struct EltwiseData {
|
||||||
@@ -116,6 +128,7 @@ public:
|
|||||||
|
|
||||||
bool needPrepareParams() const override;
|
bool needPrepareParams() const override;
|
||||||
void prepareParams() override;
|
void prepareParams() override;
|
||||||
|
void createPrimitive() override;
|
||||||
|
|
||||||
void executeDynamicImpl(dnnl::stream strm) override;
|
void executeDynamicImpl(dnnl::stream strm) override;
|
||||||
|
|
||||||
@@ -137,16 +150,27 @@ private:
|
|||||||
|
|
||||||
dnnl::algorithm onednnAlgorithm = dnnl::algorithm::undef;
|
dnnl::algorithm onednnAlgorithm = dnnl::algorithm::undef;
|
||||||
|
|
||||||
bool canUseOptimizedImpl = false;
|
EltwiseImplType implType = EltwiseImplType::reference;
|
||||||
|
std::vector<bool> broadcastPolicy;
|
||||||
bool isDynBatchEnabled = false;
|
bool isDynBatchEnabled = false;
|
||||||
bool specialConvolutionAddFusing = false;
|
bool specialConvolutionAddFusing = false;
|
||||||
size_t inputNum = 0;
|
size_t inputNum = 0;
|
||||||
std::vector<ptrdiff_t> start_offset_in = {};
|
std::vector<ptrdiff_t> start_offset_in = {};
|
||||||
ptrdiff_t start_offset_out = 0;
|
ptrdiff_t start_offset_out = 0;
|
||||||
|
|
||||||
|
std::vector<InferenceEngine::Precision> inpPrc;
|
||||||
|
InferenceEngine::Precision outPrc;
|
||||||
|
|
||||||
// blocked dims for which kernel compiled and params prepared
|
// blocked dims for which kernel compiled and params prepared
|
||||||
std::vector<VectorDims> currentInBlkDims = {};
|
std::vector<VectorDims> currentInBlkDims = {};
|
||||||
|
|
||||||
|
// shape agnostic kernel
|
||||||
|
struct { // per-shape data recomputed in prepareParams() when implType is optimizedShapeAgnostic
|
||||||
|
VectorDims outDims; // resized to the rank of dims_in[0]; current output block dims are copied into the trailing entries
|
||||||
|
std::vector<VectorDims> inOffsets; // one vector per input: strides scaled by the input precision size, 0 where the input dim differs from the output dim
|
||||||
|
VectorDims outOffsets; // output strides computed from outDims and scaled by the output precision size
|
||||||
|
} execParams; // execute() passes these buffers to the kernel via args_ptrs.src_offsets / args_ptrs.dst_offsets
|
||||||
|
|
||||||
float alpha = 0;
|
float alpha = 0;
|
||||||
float beta = 0;
|
float beta = 0;
|
||||||
float gamma = 0;
|
float gamma = 0;
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user