[GPU] Add permute primitive instead of manual copy for deconv weights (#19516)

This commit is contained in:
Vladimir Paramuzov
2023-09-04 17:03:49 +04:00
committed by GitHub
parent 2d760ba1bf
commit 2f782b2131
2 changed files with 47 additions and 123 deletions

View File

@@ -65,40 +65,60 @@ static cldnn::tensor getConstTensor(const ov::Shape constDims) {
struct ConstProperties {
bool needsBatchInterpretation;
bool swapOI;
bool hasGroupDimension;
};
static void createClDnnConstant(ProgramBuilder& p, const ov::Shape& constDims, const std::shared_ptr<ov::op::v0::Constant>& op, const ConstProperties& props);
static void create_data(ProgramBuilder& p, const ov::Shape& constDims, const std::shared_ptr<ov::op::v0::Constant>& op, const ConstProperties& props) {
cldnn::tensor constTensor = getConstTensor(constDims);
auto constFormat = cldnn::format::get_default_format(constDims.size());
if (props.needsBatchInterpretation) {
constTensor.batch[0] = static_cast<cldnn::tensor::value_type>(constTensor.count());
constTensor.feature[0] = 1;
}
// If constDims has a dimension = 0, then create tensor with single value
// TODO: check if dim=0 is a valid case
if (std::accumulate(constDims.begin(), constDims.end(), size_t(1), std::multiplies<size_t>()) == 0)
constTensor = cldnn::tensor{1};
auto newDims = constDims;
cldnn::data_types out_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0));
cldnn::layout constLayout = p.use_new_shape_infer() ? cldnn::layout(newDims, out_dtype, constFormat) :
cldnn::layout(out_dtype, constFormat, constTensor);
cldnn::primitive_id initialconstPrimID = layer_type_name_ID(op);
cldnn::primitive_id constPrimID;
auto data = op->get_data_ptr<char>();
auto bufIter = p.blobMemCache.find(std::make_pair(data, newDims));
if (bufIter != p.blobMemCache.end()) {
constPrimID = bufIter->second;
p.primitive_ids[initialconstPrimID] = constPrimID;
p.profiling_ids.push_back(initialconstPrimID);
} else {
GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant]" << std::endl;
cldnn::memory::ptr mem = p.get_engine().allocate_memory(constLayout, false);
auto& stream = p.get_engine().get_service_stream();
cldnn::mem_lock<char> lock{mem, stream};
auto buf = lock.data();
auto bufSize = constLayout.bytes_count();
std::memcpy(&buf[0], &data[0], bufSize);
p.add_primitive(*op, cldnn::data(initialconstPrimID, mem));
p.blobMemCache[std::make_pair(data, newDims)] = initialconstPrimID;
constPrimID = initialconstPrimID;
}
}
static void CreateConstantOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::Constant>& op) {
ov::Shape constDims = op->get_shape();
auto constUsers = op->get_output_target_inputs(0);
size_t numConstUsers = constUsers.size();
std::unordered_map<std::shared_ptr<ov::op::v0::Constant>, ConstProperties> consts = {
{op, {false, false, false}}
{op, {false}}
};
// handleConvWeights function is executed when one of the constant users is ConvolutionBackpropData or GroupConvolutionBackpropData.
// In that case, we mark that constant's O and I dimensions need to be swapped.
auto handleConvWeights = [&op] (ov::Node* conv, std::unordered_map<std::shared_ptr<ov::op::v0::Constant>, ConstProperties>& consts,
size_t& numConstUsers, bool hasGroupDimension) {
// If constant has multiple users - create its copy and replace 'conv' weights with the copy.
// This is to make sure that dimension change doesn't break other users of the constant node.
// It is a shallow copy, but that's fine since in createClDnnConstant
// every constant created here, gets memcopied to a brand new cldnn::memory.
if (numConstUsers > 1) {
auto constant = std::make_shared<ov::op::v0::Constant>(*(op.get()));
conv->input(1).replace_source_output(constant);
consts.insert({constant, {false, true, hasGroupDimension}});
numConstUsers--;
} else {
consts[op].swapOI = true;
consts[op].hasGroupDimension = hasGroupDimension;
}
};
auto is_binary_eltwise = [&] (ov::Node* op) -> bool {
if (ov::op::util::is_binary_elementwise_arithmetic(op) ||
ov::op::util::is_binary_elementwise_logical(op) ||
@@ -152,10 +172,6 @@ static void CreateConstantOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0
ov::is_type<ov::op::v1::Split>(outOp) ||
ov::is_type<ov::op::v1::VariadicSplit>(outOp)) {
consts[op].needsBatchInterpretation = constDims.size() == 1;
} else if (ov::is_type<ov::op::v1::ConvolutionBackpropData>(outOp) && node.get_index() == 1) {
handleConvWeights(outOp, consts, numConstUsers, false);
} else if (ov::is_type<ov::op::v1::GroupConvolutionBackpropData>(outOp) && node.get_index() == 1) {
handleConvWeights(outOp, consts, numConstUsers, true);
} else if (ov::is_type<ov::op::v0::PRelu>(outOp) && node.get_index() == 1) {
// PReLU slope tensor reshape policy
//
@@ -187,97 +203,7 @@ static void CreateConstantOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0
}
for (auto& it : consts) {
createClDnnConstant(p, constDims, it.first, it.second);
}
}
void createClDnnConstant(ProgramBuilder& p, const ov::Shape& constDims, const std::shared_ptr<ov::op::v0::Constant>& op, const ConstProperties& props) {
cldnn::tensor constTensor = getConstTensor(constDims);
auto constFormat = cldnn::format::get_default_format(constDims.size());
if (props.needsBatchInterpretation) {
constTensor.batch[0] = static_cast<cldnn::tensor::value_type>(constTensor.count());
constTensor.feature[0] = 1;
}
// If constDims has a dimension = 0, then create tensor with single value
// TODO: check if dim=0 is a valid case
if (std::accumulate(constDims.begin(), constDims.end(), size_t(1), std::multiplies<size_t>()) == 0)
constTensor = cldnn::tensor{1};
// Swap O and I dimensions to match expected deconvolution weights format
size_t inputFeatureElements = 1;
size_t outputFeatureElements = 1;
size_t groups = 1;
auto newDims = constDims;
if (props.swapOI) {
size_t expected_min_rank = 2 + (props.hasGroupDimension ? 1 : 0);
if (expected_min_rank > constDims.size())
OPENVINO_THROW("Invalid constant properties or shape");
if (props.hasGroupDimension) {
std::swap(newDims[2], newDims[1]);
inputFeatureElements = newDims[2];
outputFeatureElements = newDims[1];
groups = newDims[0];
} else {
std::swap(newDims[1], newDims[0]);
inputFeatureElements = newDims[1];
outputFeatureElements = newDims[0];
groups = 1;
}
constTensor = getConstTensor(newDims);
}
cldnn::data_types out_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0));
cldnn::layout constLayout = p.use_new_shape_infer() ? cldnn::layout(newDims, out_dtype, constFormat) :
cldnn::layout(out_dtype, constFormat, constTensor);
cldnn::primitive_id initialconstPrimID = layer_type_name_ID(op);
cldnn::primitive_id constPrimID;
auto data = op->get_data_ptr<char>();
auto bufIter = p.blobMemCache.find(std::make_pair(data, newDims));
if (bufIter != p.blobMemCache.end()) {
constPrimID = bufIter->second;
p.primitive_ids[initialconstPrimID] = constPrimID;
p.profiling_ids.push_back(initialconstPrimID);
} else {
GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant]" << std::endl;
cldnn::memory::ptr mem = p.get_engine().allocate_memory(constLayout, false);
auto& stream = p.get_engine().get_service_stream();
cldnn::mem_lock<char> lock{mem, stream};
auto buf = lock.data();
auto bufSize = constLayout.bytes_count();
// Do actual weights reorder and change O and I channels order
if (props.swapOI) {
auto elementSize = cldnn::data_type_traits::size_of(constLayout.data_type);
size_t spatial_dim_off = props.hasGroupDimension ? 3 : 2;
size_t featureSize = elementSize;
for (size_t i = spatial_dim_off; i < constDims.size(); i++) {
featureSize *= constDims[i];
}
for (size_t g = 0; g < groups; g++) {
for (size_t i = 0; i < inputFeatureElements; i++) {
for (size_t o = 0; o < outputFeatureElements; o++) {
size_t outputShift = ((g*outputFeatureElements + o)*inputFeatureElements + i)*featureSize;
size_t inputShift = ((g*inputFeatureElements + i)*outputFeatureElements + o)*featureSize;
for (size_t b = 0; b < featureSize; b++) {
buf[outputShift + b] = data[inputShift + b];
}
}
}
}
} else {
std::memcpy(&buf[0], &data[0], bufSize);
}
p.add_primitive(*op, cldnn::data(initialconstPrimID, mem));
p.blobMemCache[std::make_pair(data, newDims)] = initialconstPrimID;
constPrimID = initialconstPrimID;
create_data(p, constDims, it.first, it.second);
}
}

View File

@@ -114,12 +114,11 @@ static void CreateConvolutionBackpropDataOp(ProgramBuilder& p, const std::shared
auto weightsName = inputs[1];
auto weights_node = op->get_input_node_shared_ptr(1);
bool hasConstantWeights = IsNodeOnConstPath(weights_node);
// WA: For the cases like Const(weights)->Sub(zp)->Deconv. And also for the cases with real runtime weights.
// Dimensions order of weights blob is IOYX, but
// the selected format is OIYX by default. So we need to swap (and transpose) I and O dimensions to match the format
// For Constant node on input transpose is not needed, because the data is transposed on const node creation
if ((hasConstantWeights && std::dynamic_pointer_cast<ov::op::v0::Constant>(weights_node) == nullptr) || !hasConstantWeights) {
{
std::string permuteName = layerName + "_cldnn_weights_permute";
auto weights_rank = op->get_input_shape(1).size();
std::vector<uint16_t> permute_order(weights_rank);
@@ -204,12 +203,11 @@ static void CreateGroupConvolutionBackpropDataOp(ProgramBuilder& p, const std::s
auto weightsName = inputs[1];
auto weights_node = op->get_input_node_shared_ptr(1);
bool hasConstWeights = IsNodeOnConstPath(weights_node);
// WA: For the cases like Const(weights)->Sub(zp)->Deconv. And also for the cases with real runtime weights.
// Dimensions order of weights blob is IOYX, but
// the selected format is OIYX by default. So we need to swap I and O dimensions to match the format.
// For Constant node on input transpose is not needed, because the data is transposed on const node creation
if ((hasConstWeights && std::dynamic_pointer_cast<ov::op::v0::Constant>(weights_node) == nullptr) || !hasConstWeights) {
{
std::string permuteName = layerName + "_cldnn_weights_permute";
auto weights_rank = op->get_input_shape(1).size();
std::vector<uint16_t> permute_order(weights_rank);