Reordering is now actually skipped when NONE is chosen.

Also made reordering optional for openclSolver.
This commit is contained in:
tqiu 2021-01-11 16:51:05 +01:00
parent 71692ff52b
commit f26e58c6ca
4 changed files with 80 additions and 45 deletions

View File

@ -53,9 +53,12 @@ namespace bda
{
delete[] invDiagVals;
delete[] diagIndex;
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] toOrder;
delete[] fromOrder;
}
delete[] LUmat->nnzValues;
}
template <unsigned int block_size>
bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
@ -67,11 +70,18 @@ namespace bda
this->nnz = mat->nnzbs * block_size * block_size;
this->nnzbs = mat->nnzbs;
int *CSCRowIndices = nullptr;
int *CSCColPointers = nullptr;
if (opencl_ilu_reorder == ILUReorder::NONE) {
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
} else {
toOrder = new int[Nb];
fromOrder = new int[Nb];
int *CSCRowIndices = new int[nnzbs];
int *CSCColPointers = new int[Nb + 1];
CSCRowIndices = new int[nnzbs];
CSCColPointers = new int[Nb + 1];
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
Timer t_convert;
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
@ -80,10 +90,9 @@ namespace bda
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
}
}
Timer t_analysis;
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
std::ostringstream out;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
out << "BILU0 reordering strategy: " << "level_scheduling\n";
@ -93,15 +102,11 @@ namespace bda
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
numColors = 1;
rowsPerColor.emplace_back(Nb);
// numColors = Nb;
// for(int i = 0; i < Nb; ++i){
// rowsPerColor.emplace_back(1);
// }
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for(int i = 0; i < Nb; ++i){
toOrder[i] = i;
fromOrder[i] = i;
rowsPerColor.emplace_back(1);
}
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
@ -114,8 +119,10 @@ namespace bda
}
OpmLog::info(out.str());
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] CSCRowIndices;
delete[] CSCColPointers;
}
diagIndex = new int[mat->Nb];
invDiagVals = new double[mat->Nb * bs * bs];
@ -385,6 +392,10 @@ namespace bda
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
{
const unsigned int bs = block_size;
auto *m = mat;
if (opencl_ilu_reorder != ILUReorder::NONE) {
m = rmat.get();
Timer t_reorder;
reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
@ -393,10 +404,12 @@ namespace bda
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
}
}
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
Timer t_copy;
memcpy(LUmat->nnzValues, rmat->nnzValues, sizeof(double) * bs * bs * rmat->nnzbs);
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
if (verbosity >= 3){
std::ostringstream out;

View File

@ -126,6 +126,7 @@ public:
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int *toOrder, bool reorder);
};

View File

@ -44,7 +44,7 @@ using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
cl_int err = CL_SUCCESS;
@ -485,7 +485,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
rb = new double[N];
tmp = new double[N];
#if COPY_ROW_BY_ROW
vals_contiguous = new double[N];
@ -508,7 +507,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
if (reorder) {
rb = new double[N];
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
wcontainer->init(wellContribs, N, Nb, reorder);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@ -544,7 +548,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] rb;
}
delete[] tmp;
#if COPY_ROW_BY_ROW
delete[] vals_contiguous;
@ -572,7 +578,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
if (opencl_ilu_reorder != ILUReorder::NONE) {
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
}
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
@ -624,9 +632,13 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
int lmem_per_work_group = work_group_size * sizeof(double);
prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
} else {
toOrder = prec->getToOrder();
fromOrder = prec->getFromOrder();
rmat = prec->getRMat();
}
if (verbosity > 2) {
std::ostringstream out;
@ -645,7 +657,11 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
Timer t;
mat->nnzValues = vals;
if (opencl_ilu_reorder != ILUReorder::NONE) {
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
} else {
rb = b;
}
if (verbosity > 2) {
std::ostringstream out;
@ -703,8 +719,12 @@ template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) {
Timer t;
if (opencl_ilu_reorder != ILUReorder::NONE) {
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
} else {
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
}
if (verbosity > 2) {
std::ostringstream out;

View File

@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
double *rb = nullptr; // reordered b vector, the matrix is reordered, so b must also be
double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
// OpenCL variables must be reusable, they are initialized in initialize()
@ -59,7 +59,7 @@ private:
cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
cl::Buffer d_toOrder;
cl::Buffer d_toOrder; // only used when reordering is used
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
// shared pointers are also passed to other objects
@ -81,7 +81,8 @@ private:
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
bool analysis_done = false;
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix, used for spmv
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
ILUReorder opencl_ilu_reorder; // reordering strategy
/// Divide A by B, and round up: return (int)ceil(A/B)
/// \param[in] A dividend