Do not reorder the matrix for GPU

This commit is contained in:
Tong Dong Qiu
2022-09-22 10:15:23 +02:00
parent 2690917e93
commit e327142088
12 changed files with 76 additions and 126 deletions

View File

@@ -93,16 +93,16 @@ void reorderBlockedVectorByPattern(int Nb, double *vector, int *fromOrder, doubl
bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIndices, const std::vector<bool>& doneRows);
/// Find a level scheduling reordering for an input matrix
/// The toOrder and fromOrder arrays must be allocated already
/// The toOrder and fromOrder arrays must be allocated already
/// \param[in] CSRColIndices column indices array, obtained from storing the input matrix in the CSR format
/// \param[in] CSRRowPointers row pointers array, obtained from storing the input matrix in the CSR format
/// \param[in] CSCRowIndices row indices array, obtained from storing the input matrix in the CSC format
/// \param[in] CSCColPointers column pointers array, obtained from storing the input matrix in the CSC format
/// \param[in] Nb number of blockrows in the matrix
/// \param[out] numColors a pointer to the number of colors needed for the level scheduling
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill
/// \param[out] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
/// \param[out] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
/// \param[out] rowsPerColor for each color, an array of all rowIndices in that color, this function uses emplace_back() to fill
void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);
/// Find a graph coloring reordering for an input matrix

View File

@@ -78,14 +78,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
fromOrder.resize(Nb);
CSCRowIndices.resize(matToDecompose->nnzbs);
CSCColPointers.resize(Nb + 1);
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
if (jacMat) {
rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
} else {
LUmat = std::make_unique<BlockedMatrix>(*rmat);
}
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
Timer t_convert;
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
@@ -101,17 +95,9 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
out << "BILU0 reordering strategy: " << "level_scheduling\n";
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
// numColors = 1;
@@ -143,6 +129,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
#if CHOW_PATEL
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
@@ -156,14 +143,16 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
#endif
events.resize(2);
events.resize(3);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
@@ -189,27 +178,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
{
const unsigned int bs = block_size;
auto *matToDecompose = jacMat;
if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
matToDecompose = jacMat ? jacMat : mat;
} else {
Timer t_reorder;
if (jacMat) {
matToDecompose = rJacMat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
} else {
matToDecompose = rmat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
}
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
}
}
auto *matToDecompose = jacMat ? jacMat : mat;
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
@@ -269,11 +238,11 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
std::ostringstream out;
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color+1];
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow,
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
}
@@ -301,11 +270,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows,
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows,
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
@@ -313,11 +282,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows,
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows,
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif

View File

@@ -54,8 +54,6 @@ class BILU0 : public Preconditioner<block_size>
private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
std::shared_ptr<BlockedMatrix> rJacMat = nullptr;
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
#endif
@@ -76,6 +74,7 @@ private:
cl::Buffer invDiagVals;
cl::Buffer diagIndex;
cl::Buffer rowsPerColor;
cl::Buffer rowIndices;
#if CHOW_PATEL
cl::Buffer Lvals, Lcols, Lrows;
cl::Buffer Uvals, Ucols, Urows;
@@ -117,12 +116,7 @@ public:
BlockedMatrix* getRMat() override
{
return rmat.get();
}
BlockedMatrix* getRJacMat()
{
return rJacMat.get();
return LUmat.get();
}
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()

View File

@@ -3,6 +3,7 @@
/// Here, L is it's own BSR matrix.
/// Only used with ChowPatelIlu.
__kernel void ILU_apply1(
__global const unsigned *rowIndices,
__global const double *LUvals,
__global const unsigned int *LUcols,
__global const unsigned int *LUrows,
@@ -30,14 +31,16 @@ __kernel void ILU_apply1(
target_block_row += nodesPerColorPrefix[color];
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row];
const unsigned int last_block = LUrows[target_block_row+1];
unsigned row_idx = rowIndices[target_block_row];
const unsigned int first_block = LUrows[row_idx];
const unsigned int last_block = LUrows[row_idx+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
local_out = y[row_idx*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
@@ -60,7 +63,7 @@ __kernel void ILU_apply1(
}
if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
const unsigned int row = row_idx*bs + lane;
x[row] = tmp[lane];
}

View File

@@ -3,6 +3,7 @@
/// Here, L is inside a normal, square matrix.
/// In this case, diagIndex indicates where the rows of L end.
__kernel void ILU_apply1(
__global const unsigned *rowIndices,
__global const double *LUvals,
__global const unsigned int *LUcols,
__global const unsigned int *LUrows,
@@ -29,14 +30,17 @@ __kernel void ILU_apply1(
const unsigned int r = lane % bs;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row];
const unsigned int last_block = diagIndex[target_block_row];
unsigned row_idx = rowIndices[target_block_row];
const unsigned int first_block = LUrows[row_idx];
const unsigned int last_block = diagIndex[row_idx];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
local_out = y[target_block_row*bs+lane];
local_out = y[row_idx*bs+lane];
}
for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c];
@@ -59,7 +63,7 @@ __kernel void ILU_apply1(
}
if(lane < bs){
const unsigned int row = target_block_row*bs + lane;
const unsigned int row = row_idx*bs + lane;
x[row] = tmp[lane];
}

View File

@@ -3,6 +3,7 @@
/// Here, U is it's own BSR matrix.
/// Only used with ChowPatelIlu.
__kernel void ILU_apply2(
__global const unsigned *rowIndices,
__global const double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
@@ -29,14 +30,15 @@ __kernel void ILU_apply2(
const unsigned int r = lane % bs;
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row];
const unsigned int last_block = LUrows[target_block_row+1];
unsigned row_idx = rowIndices[target_block_row];
const unsigned int first_block = LUrows[row_idx];
const unsigned int last_block = LUrows[row_idx+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
const unsigned int row = row_idx*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
@@ -64,10 +66,10 @@ __kernel void ILU_apply2(
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
}
const unsigned int row = target_block_row*bs + lane;
const unsigned int row = row_idx*bs + lane;
x[row] = sum;
}

View File

@@ -3,6 +3,7 @@
/// Here, U is inside a normal, square matrix.
/// In this case diagIndex indicates where the rows of U start.
__kernel void ILU_apply2(
__global const unsigned *rowIndices,
__global const double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
@@ -30,14 +31,15 @@ __kernel void ILU_apply2(
target_block_row += nodesPerColorPrefix[color];
while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = diagIndex[target_block_row] + 1;
const unsigned int last_block = LUrows[target_block_row+1];
unsigned row_idx = rowIndices[target_block_row];
const unsigned int first_block = diagIndex[row_idx] + 1;
const unsigned int last_block = LUrows[row_idx+1];
unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0;
if(lane < num_active_threads){
if(lane < bs){
const unsigned int row = target_block_row*bs+lane;
const unsigned int row = row_idx*bs+lane;
local_out = x[row];
}
for(; block < last_block; block += num_blocks_per_warp){
@@ -65,10 +67,10 @@ __kernel void ILU_apply2(
tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0;
for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
}
const unsigned int row = target_block_row*bs + lane;
const unsigned int row = row_idx*bs + lane;
x[row] = sum;
}

View File

@@ -68,6 +68,7 @@ __kernel void inverter(__global double *matrix, __global double *inverse)
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
__kernel void ilu_decomp(const unsigned int firstRow,
const unsigned int lastRow,
__global const unsigned *rowIndices,
__global double *LUvals,
__global const int *LUcols,
__global const int *LUrows,
@@ -91,14 +92,15 @@ __kernel void ilu_decomp(const unsigned int firstRow,
// go through all rows
for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
{
int iRowStart = LUrows[i];
int iRowEnd = LUrows[i + 1];
const unsigned row = rowIndices[i];
int iRowStart = LUrows[row];
int iRowEnd = LUrows[row + 1];
// go through all elements of the row
for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUcols[ij];
if (j < i) {
if (j < row) {
// calculate the pivot of this row
block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
@@ -127,11 +129,11 @@ __kernel void ilu_decomp(const unsigned int firstRow,
}
// store the inverse in the diagonal
inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
inverter(LUvals + diagIndex[row] * bs * bs, invDiagVals + row * bs * bs);
// copy inverse
if (thread_id_in_hwarp < bs * bs) {
LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
LUvals[diagIndex[row] * bs * bs + thread_id_in_hwarp] = invDiagVals[row * bs * bs + thread_id_in_hwarp];
}
}
}

View File

@@ -390,7 +390,7 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
}
}
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
@@ -403,8 +403,8 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, diagIndex, y, x,
rowsPerColor, color, block_size,
rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
@@ -415,7 +415,7 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
}
}
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
@@ -428,8 +428,8 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, diagIndex, invDiagVals,
x, rowsPerColor, color, block_size,
rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
@@ -440,7 +440,7 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
}
}
void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
@@ -453,7 +453,8 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
firstRow, lastRow, vals, cols, rows,
firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group));

View File

@@ -39,9 +39,9 @@ using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using stdwell_apply_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
@@ -51,7 +51,7 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
@@ -135,13 +135,13 @@ public:
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false);
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
static void ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
static void ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,

View File

@@ -232,12 +232,6 @@ void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& co
queue = queue_;
}
template <unsigned int block_size>
openclSolverBackend<block_size>::~openclSolverBackend() {
finalize();
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
@@ -464,7 +458,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
if (reorder) {
rb = new double[N];
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
@@ -482,13 +475,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] rb;
}
} // end finalize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t;
@@ -508,7 +494,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
if (opencl_ilu_reorder != ILUReorder::NONE) {
events.resize(6);
@@ -546,7 +532,7 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
#endif
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
@@ -602,13 +588,8 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel
Timer t;
mat->nnzValues = vals;
if (opencl_ilu_reorder != ILUReorder::NONE) {
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
} else {
rb = b;
static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
}
h_b = b;
static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
if (verbosity > 2) {
std::ostringstream out;
@@ -670,12 +651,7 @@ template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) {
Timer t;
if (opencl_ilu_reorder != ILUReorder::NONE) {
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
} else {
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
}
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
if (verbosity > 2) {
std::ostringstream out;

View File

@@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
double *h_b = nullptr; // b vector, on host
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
// OpenCL variables must be reusable, they are initialized in initialize()
@@ -184,9 +184,6 @@ public:
/// For the CPR coarse solver
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
/// Destroy a openclSolver, and free memory
~openclSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values